[llvm] [NVPTX] Add syncscope support for cmpxchg (PR #140812)

Akshay Deodhar via llvm-commits llvm-commits at lists.llvm.org
Tue May 20 15:26:36 PDT 2025


https://github.com/akshayrdeodhar created https://github.com/llvm/llvm-project/pull/140812

This MR adds support for cmpxchg instructions with syncscope.

Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend.
Handles syncscope correctly for emulation loops in AtomicExpand, in bracketInstWithFences.
Modifies emitLeadingFence and emitTrailingFence to accept a SyncScope as a parameter, and updates the overrides of these hooks in the other backends (ARM, PowerPC, RISCV) to match the new signature.
Adds tests for all possible combinations of the cmpxchg instruction (with modifications to cmpxchg.py).

>From ba21c292b03d8a9aad729d15b46be373b21e4569 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 9 May 2025 01:00:43 +0000
Subject: [PATCH] [NVPTX] Add syncscope support for cmpxchg

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |    16 +-
 llvm/lib/CodeGen/AtomicExpandPass.cpp         |    18 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |    10 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |     6 +-
 llvm/lib/Target/ARM/ARMISelLowering.h         |    10 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |    13 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |    12 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |    58 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |     6 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |    12 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |     6 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |    12 +-
 llvm/test/CodeGen/NVPTX/atomics-sm90.ll       |     8 +-
 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll       | 16140 ++++++++++--
 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll       | 16238 ++++++++++--
 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll       | 21894 ++++++++++++++--
 llvm/test/CodeGen/NVPTX/cmpxchg.ll            |    40 +-
 llvm/test/CodeGen/NVPTX/cmpxchg.py            |    13 +-
 .../NVPTX/distributed-shared-cluster.ll       |    46 +-
 19 files changed, 47182 insertions(+), 7376 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
   ///   standard ABI uses a fence before a seq_cst load instead of after a
   ///   seq_cst store).
   /// @{
-  virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
-                                        Instruction *Inst,
-                                        AtomicOrdering Ord) const;
-
-  virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
-                                         Instruction *Inst,
-                                         AtomicOrdering Ord) const;
+  virtual Instruction *
+  emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+                   AtomicOrdering Ord,
+                   SyncScope::ID SSID = SyncScope::System) const;
+
+  virtual Instruction *
+  emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+                    AtomicOrdering Ord,
+                    SyncScope::ID SSID = SyncScope::System) const;
   /// @}
 
   // Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
   const DataLayout *DL = nullptr;
 
 private:
-  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+                             SyncScope::ID SSID = SyncScope::System);
   IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
   LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
   bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
 
   if (TLI->shouldInsertFencesForAtomic(I)) {
     auto FenceOrdering = AtomicOrdering::Monotonic;
+    SyncScope::ID SSID = SyncScope::System;
     if (LI && isAcquireOrStronger(LI->getOrdering())) {
       FenceOrdering = LI->getOrdering();
       LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
       // expandAtomicCmpXchg in that case.
       FenceOrdering = CASI->getMergedOrdering();
       auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+      SSID = CASI->getSyncScopeID();
 
       CASI->setSuccessOrdering(CASOrdering);
       CASI->setFailureOrdering(CASOrdering);
+      // If CAS ordering is monotonic, then the operation will
+      // take default scope. Otherwise, it will retain its scope
+      if (CASOrdering != AtomicOrdering::Monotonic)
+        CASI->setSyncScopeID(SSID);
     }
 
     if (FenceOrdering != AtomicOrdering::Monotonic) {
-      MadeChange |= bracketInstWithFences(I, FenceOrdering);
+      MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
     }
   } else if (I->hasAtomicStore() &&
              TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
 }
 
 bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
-                                             AtomicOrdering Order) {
+                                             AtomicOrdering Order,
+                                             SyncScope::ID SSID) {
   ReplacementIRBuilder Builder(I, *DL);
 
-  auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+  auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
 
-  auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+  auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
   // We have a guard here because not every atomic operation generates a
   // trailing fence.
   if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
 
 Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
                                                   Instruction *Inst,
-                                                  AtomicOrdering Ord) const {
+                                                  AtomicOrdering Ord,
+                                                  SyncScope::ID SSID) const {
   if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
-    return Builder.CreateFence(Ord);
+    return Builder.CreateFence(Ord, SSID);
   else
     return nullptr;
 }
 
 Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
                                                    Instruction *Inst,
-                                                   AtomicOrdering Ord) const {
+                                                   AtomicOrdering Ord,
+                                                   SyncScope::ID SSID) const {
   if (isAcquireOrStronger(Ord))
-    return Builder.CreateFence(Ord);
+    return Builder.CreateFence(Ord, SSID);
   else
     return nullptr;
 }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
-                                                 AtomicOrdering Ord) const {
+                                                 AtomicOrdering Ord,
+                                                 SyncScope::ID SSID) const {
   switch (Ord) {
   case AtomicOrdering::NotAtomic:
   case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
 
 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                   Instruction *Inst,
-                                                  AtomicOrdering Ord) const {
+                                                  AtomicOrdering Ord,
+                                                  SyncScope::ID SSID) const {
   switch (Ord) {
   case AtomicOrdering::NotAtomic:
   case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
     void
     emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
 
-    Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                  AtomicOrdering Ord) const override;
-    Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                   AtomicOrdering Ord) const override;
+    Instruction *emitLeadingFence(
+        IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+        SyncScope::ID SSID = SyncScope::System) const override;
+    Instruction *emitTrailingFence(
+        IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+        SyncScope::ID SSID = SyncScope::System) const override;
 
     unsigned getMaxSupportedInterleaveFactor() const override;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
 
 Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                    Instruction *Inst,
-                                                   AtomicOrdering Ord) const {
+                                                   AtomicOrdering Ord,
+                                                   SyncScope::ID SSID) const {
   if (!isa<AtomicCmpXchgInst>(Inst))
     return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
 
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
   // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
   if (isReleaseOrStronger(Ord))
     return Ord == AtomicOrdering::SequentiallyConsistent
-               ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
-               : Builder.CreateFence(AtomicOrdering::Release);
+               ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+                                     SSID)
+               : Builder.CreateFence(AtomicOrdering::Release, SSID);
 
   return nullptr;
 }
 
 Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                     Instruction *Inst,
-                                                    AtomicOrdering Ord) const {
+                                                    AtomicOrdering Ord,
+                                                    SyncScope::ID SSID) const {
   // Specialize for cmpxchg
   if (!isa<AtomicCmpXchgInst>(Inst))
     return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
   if (isAcquireOrStronger(Ord) &&
       (Ord != AtomicOrdering::SequentiallyConsistent ||
        CASWidth < STI.getMinCmpXchgSizeInBits()))
-    return Builder.CreateFence(AtomicOrdering::Acquire);
+    return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
 
   return nullptr;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
   AtomicOrdering
   atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
 
-  Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                AtomicOrdering Ord) const override;
-  Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                 AtomicOrdering Ord) const override;
+  Instruction *
+  emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+                   AtomicOrdering Ord,
+                   SyncScope::ID SSID = SyncScope::System) const override;
+  Instruction *
+  emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+                    AtomicOrdering Ord,
+                    SyncScope::ID SSID = SyncScope::System) const override;
 
   unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
                                      EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
   }];
 }
 
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+  defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+  def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+        return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+  }]>;
+  def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+        return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+  }]>;
+  def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+        return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+  }]>;
+  def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+        return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+  }]>;
+}
+
+
 // A node that will be replaced with the current PTX version.
 class PTX {
   SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
 }
 
 // has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
-                      SDPatternOperator op, list<Predicate> preds> {
-  defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str, 
+            string op_str, SDPatternOperator op, list<Predicate> preds> {
+  defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
     def rr : NVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
   defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
 }
 
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
   defvar frag_pat = (frag node:$a, node:$b, node:$c);
-  defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
-  defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
-  defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
-  defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+  defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+  defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+  defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+  defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
 }
 
 // atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
   foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
     defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
     defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+    // Instantiate scoped versions of the atomic compare and swap pattern
+    defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+    foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+      defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+      // Syncscope is only supported for SM70+
+      defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+        : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+    }
+
     // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
     // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
     // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
     defm INT_PTX_ATOM_CAS_#t.Size#_#order
-      : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+      : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
     defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
-      : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+      : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
   }
 }
 
 // Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
 
 // Support for scoped atomic operations.  Matches
 // int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
                        RegTyInfo t, list<Predicate> Preds> {
   defm "" : F_ATOMIC_3<t,
                        as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
-                       sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+                       sem_str = "",
+                       scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
                        op_str = OpStr # "." # TypeStr,
                        op = !cast<Intrinsic>(
                               "int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
 Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
-                                                 AtomicOrdering Ord) const {
+                                                 AtomicOrdering Ord,
+                                                 SyncScope::ID SSID) const {
   if (Ord == AtomicOrdering::SequentiallyConsistent)
     return callIntrinsic(Builder, Intrinsic::ppc_sync);
   if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
 
 Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                   Instruction *Inst,
-                                                  AtomicOrdering Ord) const {
+                                                  AtomicOrdering Ord,
+                                                  SyncScope::ID SSID) const {
   if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
     // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
     // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
       return true;
     }
 
-    Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                  AtomicOrdering Ord) const override;
-    Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                   AtomicOrdering Ord) const override;
+    Instruction *
+    emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+                     AtomicOrdering Ord,
+                     SyncScope::ID SSID = SyncScope::System) const override;
+    Instruction *
+    emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+                      AtomicOrdering Ord,
+                      SyncScope::ID SSID = SyncScope::System) const override;
 
     bool shouldInlineQuadwordAtomics() const;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1158499718737..175153242ab0f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22800,7 +22800,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint(
 
 Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                    Instruction *Inst,
-                                                   AtomicOrdering Ord) const {
+                                                   AtomicOrdering Ord,
+                                                   SyncScope::ID SSID) const {
   if (Subtarget.hasStdExtZtso()) {
     if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
       return Builder.CreateFence(Ord);
@@ -22816,7 +22817,8 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
 
 Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                     Instruction *Inst,
-                                                    AtomicOrdering Ord) const {
+                                                    AtomicOrdering Ord,
+                                                    SyncScope::ID SSID) const {
   if (Subtarget.hasStdExtZtso()) {
     if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
       return Builder.CreateFence(Ord);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index fc8d8b8ce1b56..2a1c4ea0cad37 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -225,10 +225,14 @@ class RISCVTargetLowering : public TargetLowering {
   // than this hook due to limitations in the interface here.
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
 
-  Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                AtomicOrdering Ord) const override;
-  Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
-                                 AtomicOrdering Ord) const override;
+  Instruction *
+  emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+                   AtomicOrdering Ord,
+                   SyncScope::ID SSID = SyncScope::System) const override;
+  Instruction *
+  emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+                    AtomicOrdering Ord,
+                    SyncScope::ID SSID = SyncScope::System) const override;
 
   bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                   EVT VT) const override;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index b5a4f94611453..54dfe2eb2bca5 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
 ; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
 ; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT:    atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
 ; CHECKPTX71-NEXT:    mov.b32 %r54, %r6;
 ; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
@@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
 ; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
 ; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT:    atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
 ; CHECKPTX71-NEXT:    mov.b32 %r55, %r9;
 ; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
@@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
 ; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
 ; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT:    atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
 ; CHECKPTX71-NEXT:    mov.b32 %r56, %r15;
 ; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
@@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
 ; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
 ; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT:    atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
 ; CHECKPTX71-NEXT:    mov.b32 %r57, %r21;
 ; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 65a077d67e4ba..146bc10756b23 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
 
-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB1_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB2_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_acquire_i8_generic(
+define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -158,15 +158,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB3_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB3_1;
 ; SM60-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_acquire_i8_global(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB4_1;
 ; SM60-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_acquire_i8_shared(
+define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB5_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB5_1;
 ; SM60-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB6_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB6_1;
 ; SM60-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_seq_cst_i8_global(
+define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -344,15 +338,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB7_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB7_1;
 ; SM60-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB8_1;
 ; SM60-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_monotonic_i8_generic(
+define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -437,7 +428,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -458,12 +449,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -471,8 +462,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -483,15 +474,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB10_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB10_1;
 ; SM60-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_monotonic_i8_shared(
+define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -517,8 +508,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -529,15 +520,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB11_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB11_1;
 ; SM60-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -563,8 +554,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -575,15 +566,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB12_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -596,12 +587,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_acquire_i8_global(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -609,8 +600,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -621,7 +612,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB13_1;
 ; SM60-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -655,8 +646,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -667,15 +658,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB14_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB14_1;
 ; SM60-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_seq_cst_i8_generic(
+define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -701,9 +692,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -714,15 +704,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB15_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -735,12 +725,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_seq_cst_i8_global(
+define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -748,9 +738,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -761,15 +750,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB16_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB16_1;
 ; SM60-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_seq_cst_i8_shared(
+define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -795,9 +784,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -808,7 +796,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB17_1;
 ; SM60-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_monotonic_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -842,8 +830,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -855,7 +843,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB18_1;
 ; SM60-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_monotonic_i8_global(
+define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -888,9 +877,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -901,15 +890,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB19_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB19_1;
 ; SM60-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -934,9 +924,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -947,15 +937,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB20_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB20_1;
 ; SM60-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_acquire_i8_generic(
+define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -980,8 +971,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -993,15 +984,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB21_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1014,12 +1005,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_acquire_i8_global(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1027,9 +1018,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1040,7 +1031,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB22_1;
 ; SM60-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_acquire_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1074,9 +1065,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1087,15 +1078,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB23_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB23_1;
 ; SM60-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1121,8 +1112,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1134,15 +1125,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB24_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1155,12 +1146,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1168,9 +1159,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1181,15 +1172,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB25_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB25_1;
 ; SM60-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1215,9 +1206,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1228,7 +1219,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB26_1;
 ; SM60-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_monotonic_i8_generic(
+define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1262,9 +1253,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1275,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1296,12 +1286,12 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_monotonic_i8_global(
+define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1309,9 +1299,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1322,15 +1311,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB28_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB28_1;
 ; SM60-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_monotonic_i8_shared(
+define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1356,9 +1345,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1369,15 +1357,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB29_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB29_1;
 ; SM60-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1403,9 +1391,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1416,15 +1403,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB30_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1437,12 +1424,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_acquire_i8_global(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1450,9 +1437,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1463,7 +1449,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB31_1;
 ; SM60-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_acquire_i8_shared(
+define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1497,9 +1483,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1510,15 +1495,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB32_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB32_1;
 ; SM60-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i8_generic(
+define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1544,9 +1529,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1557,15 +1541,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB33_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1578,12 +1562,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i8_global(
+define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1591,9 +1575,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1604,15 +1587,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB34_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB34_1;
 ; SM60-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i8_shared(
+define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1638,9 +1621,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB35_1;
 ; SM60-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_monotonic_i8_generic(
+define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_monotonic_i8_global(
+define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB37_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB37_1;
 ; SM60-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_monotonic_i8_shared(
+define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB38_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB38_1;
 ; SM60-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_acquire_i8_generic(
+define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1839,15 +1817,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB39_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1860,12 +1838,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_acquire_i8_global(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB40_1;
 ; SM60-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_acquire_i8_shared(
+define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB41_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB41_1;
 ; SM60-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i8_generic(
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB42_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2001,12 +1976,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i8_global(
+define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -2027,15 +2001,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM60-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB43_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB43_1;
 ; SM60-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i8_shared(
+define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
@@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
@@ -2092,3589 +2065,14968 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB44_1;
 ; SM60-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
     ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_monotonic_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB45_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB45_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB45_1;
 ; SM60-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_monotonic_i16_global(
+define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB46_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB46_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB46_1;
 ; SM60-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_monotonic_i16_shared(
+define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB47_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB47_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB47_1;
 ; SM60-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_acquire_i16_generic(
+define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB48_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB48_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB48_1;
 ; SM60-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_acquire_i16_global(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB49_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB49_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB49_1;
 ; SM60-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_acquire_i16_shared(
+define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB50_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB50_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB50_1;
 ; SM60-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB51_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB51_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB51_1;
 ; SM60-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB52_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB52_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB52_1;
 ; SM60-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB53_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB53_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB53_1;
 ; SM60-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_monotonic_i16_generic(
+define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB54_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB54_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB54_1;
 ; SM60-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_monotonic_i16_global(
+define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB55_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB55_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB55_1;
 ; SM60-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_monotonic_i16_shared(
+define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB56_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB56_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB56_1;
 ; SM60-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_acquire_i16_generic(
+define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB57_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB57_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB57_1;
 ; SM60-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_acquire_i16_global(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB58_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB58_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB58_1;
 ; SM60-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_acquire_i16_shared(
+define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB59_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB59_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB59_1;
 ; SM60-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_seq_cst_i16_generic(
+define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB60_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB60_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB60_1;
 ; SM60-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_seq_cst_i16_global(
+define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB61_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB61_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB61_1;
 ; SM60-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_seq_cst_i16_shared(
+define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB62_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB62_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB62_1;
 ; SM60-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_generic(
+define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB63_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB63_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB63_1;
 ; SM60-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_global(
+define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB64_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB64_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB64_1;
 ; SM60-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_shared(
+define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB65_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB65_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB65_1;
 ; SM60-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_generic(
+define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB66_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB66_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB66_1;
 ; SM60-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_global(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB67_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB67_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB67_1;
 ; SM60-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_shared(
+define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB68_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB68_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB68_1;
 ; SM60-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_generic(
+define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB69_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB69_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB69_1;
 ; SM60-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_global(
+define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB70_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB70_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB70_1;
 ; SM60-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_shared(
+define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB71_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB71_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB71_1;
 ; SM60-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_generic(
+define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB72_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB72_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB72_1;
 ; SM60-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_global(
+define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB73_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB73_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB73_1;
 ; SM60-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_shared(
+define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB74_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB74_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB74_1;
 ; SM60-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_generic(
+define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB75_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB75_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB75_1;
 ; SM60-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_global(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB76_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB76_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB76_1;
 ; SM60-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_shared(
+define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB77_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB77_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB77_1;
 ; SM60-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_generic(
+define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB78_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB78_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB78_1;
 ; SM60-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_global(
+define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB79_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB79_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB79_1;
 ; SM60-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_shared(
+define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB80_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB80_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB80_1;
 ; SM60-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_generic(
+define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB81_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB81_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB81_1;
 ; SM60-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_global(
+define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB82_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB82_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB82_1;
 ; SM60-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_shared(
+define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB83_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB83_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB83_1;
 ; SM60-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_generic(
+define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB84_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB84_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB84_1;
 ; SM60-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_global(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB85_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB85_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB85_1;
 ; SM60-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_shared(
+define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB86_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB86_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB86_1;
 ; SM60-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_generic(
+define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB87_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB87_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB87_1;
 ; SM60-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_global(
+define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB88_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB88_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB88_1;
 ; SM60-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_shared(
+define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<21>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM60-NEXT:    @%p1 bra $L__BB89_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB89_1 Depth=1
 ; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB89_1;
 ; SM60-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB90_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB90_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB90_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB90_1;
+; SM60-NEXT:  $L__BB90_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB91_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB91_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB91_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB91_1;
+; SM60-NEXT:  $L__BB91_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB92_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB92_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB92_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB92_1;
+; SM60-NEXT:  $L__BB92_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB93_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB93_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB93_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB93_1;
+; SM60-NEXT:  $L__BB93_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB94_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB94_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB94_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB94_1;
+; SM60-NEXT:  $L__BB94_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB95_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB95_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB95_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB95_1;
+; SM60-NEXT:  $L__BB95_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB96_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB96_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB96_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB96_1;
+; SM60-NEXT:  $L__BB96_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB97_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB97_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB97_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB97_1;
+; SM60-NEXT:  $L__BB97_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB98_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB98_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB98_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB98_1;
+; SM60-NEXT:  $L__BB98_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB99_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB99_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB99_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB99_1;
+; SM60-NEXT:  $L__BB99_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB100_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB100_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB100_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB100_1;
+; SM60-NEXT:  $L__BB100_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB101_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB101_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB101_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB101_1;
+; SM60-NEXT:  $L__BB101_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB102_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB102_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB102_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB102_1;
+; SM60-NEXT:  $L__BB102_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB103_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB103_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB103_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB103_1;
+; SM60-NEXT:  $L__BB103_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB104_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB104_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB104_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB104_1;
+; SM60-NEXT:  $L__BB104_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB105_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB105_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB105_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB105_1;
+; SM60-NEXT:  $L__BB105_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB106_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB106_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB106_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB106_1;
+; SM60-NEXT:  $L__BB106_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB107_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB107_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB107_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB107_1;
+; SM60-NEXT:  $L__BB107_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB108_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB108_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB108_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB108_1;
+; SM60-NEXT:  $L__BB108_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB109_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB109_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB109_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB109_1;
+; SM60-NEXT:  $L__BB109_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB110_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB110_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB110_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB110_1;
+; SM60-NEXT:  $L__BB110_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB111_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB111_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB111_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB111_1;
+; SM60-NEXT:  $L__BB111_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB112_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB112_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB112_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB112_1;
+; SM60-NEXT:  $L__BB112_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB113_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB113_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB113_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB113_1;
+; SM60-NEXT:  $L__BB113_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB114_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB114_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB114_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB114_1;
+; SM60-NEXT:  $L__BB114_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB115_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB115_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB115_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB115_1;
+; SM60-NEXT:  $L__BB115_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB116_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB116_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB116_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB116_1;
+; SM60-NEXT:  $L__BB116_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB117_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB117_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB117_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB117_1;
+; SM60-NEXT:  $L__BB117_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB118_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB118_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB118_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB118_1;
+; SM60-NEXT:  $L__BB118_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB119_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB119_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB119_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB119_1;
+; SM60-NEXT:  $L__BB119_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB120_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB120_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB120_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB120_1;
+; SM60-NEXT:  $L__BB120_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB121_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB121_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB121_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB121_1;
+; SM60-NEXT:  $L__BB121_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB122_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB122_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB122_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB122_1;
+; SM60-NEXT:  $L__BB122_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB123_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB123_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB123_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB123_1;
+; SM60-NEXT:  $L__BB123_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB124_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB124_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB124_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB124_1;
+; SM60-NEXT:  $L__BB124_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB125_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB125_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB125_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB125_1;
+; SM60-NEXT:  $L__BB125_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB126_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB126_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB126_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB126_1;
+; SM60-NEXT:  $L__BB126_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB127_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB127_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB127_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB127_1;
+; SM60-NEXT:  $L__BB127_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB128_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB128_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB128_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB128_1;
+; SM60-NEXT:  $L__BB128_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB129_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB129_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB129_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB129_1;
+; SM60-NEXT:  $L__BB129_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB130_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB130_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB130_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB130_1;
+; SM60-NEXT:  $L__BB130_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB131_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB131_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB131_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB131_1;
+; SM60-NEXT:  $L__BB131_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB132_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB132_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB132_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB132_1;
+; SM60-NEXT:  $L__BB132_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB133_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB133_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB133_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB133_1;
+; SM60-NEXT:  $L__BB133_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB134_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB134_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB134_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB134_1;
+; SM60-NEXT:  $L__BB134_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB135_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB135_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB135_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB135_1;
+; SM60-NEXT:  $L__BB135_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB136_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB136_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB136_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB136_1;
+; SM60-NEXT:  $L__BB136_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB137_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB137_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB137_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB137_1;
+; SM60-NEXT:  $L__BB137_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB138_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB138_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB138_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB138_1;
+; SM60-NEXT:  $L__BB138_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB139_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB139_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB139_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB139_1;
+; SM60-NEXT:  $L__BB139_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB140_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB140_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB140_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB140_1;
+; SM60-NEXT:  $L__BB140_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB141_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB141_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB141_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB141_1;
+; SM60-NEXT:  $L__BB141_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB142_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB142_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB142_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB142_1;
+; SM60-NEXT:  $L__BB142_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB143_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB143_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB143_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB143_1;
+; SM60-NEXT:  $L__BB143_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB144_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB144_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB144_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB144_1;
+; SM60-NEXT:  $L__BB144_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB145_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB145_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB145_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB145_1;
+; SM60-NEXT:  $L__BB145_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB146_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB146_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB146_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB146_1;
+; SM60-NEXT:  $L__BB146_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB147_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB147_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB147_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB147_1;
+; SM60-NEXT:  $L__BB147_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB148_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB148_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB148_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB148_1;
+; SM60-NEXT:  $L__BB148_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB149_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB149_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB149_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB149_1;
+; SM60-NEXT:  $L__BB149_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB150_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB150_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB150_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB150_1;
+; SM60-NEXT:  $L__BB150_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB151_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB151_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB151_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB151_1;
+; SM60-NEXT:  $L__BB151_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB152_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB152_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB152_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB152_1;
+; SM60-NEXT:  $L__BB152_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB153_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB153_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB153_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB153_1;
+; SM60-NEXT:  $L__BB153_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB154_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB154_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB154_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB154_1;
+; SM60-NEXT:  $L__BB154_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB155_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB155_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB155_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB155_1;
+; SM60-NEXT:  $L__BB155_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB156_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB156_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB156_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB156_1;
+; SM60-NEXT:  $L__BB156_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB157_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB157_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB157_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB157_1;
+; SM60-NEXT:  $L__BB157_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB158_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB158_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB158_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB158_1;
+; SM60-NEXT:  $L__BB158_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB159_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB159_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB159_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB159_1;
+; SM60-NEXT:  $L__BB159_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB160_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB160_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB160_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB160_1;
+; SM60-NEXT:  $L__BB160_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB161_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB161_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB161_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB161_1;
+; SM60-NEXT:  $L__BB161_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB162_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB162_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB162_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB162_1;
+; SM60-NEXT:  $L__BB162_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB163_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB163_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB163_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB163_1;
+; SM60-NEXT:  $L__BB163_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB164_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB164_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB164_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB164_1;
+; SM60-NEXT:  $L__BB164_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB165_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB165_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB165_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB165_1;
+; SM60-NEXT:  $L__BB165_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB166_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB166_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB166_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB166_1;
+; SM60-NEXT:  $L__BB166_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB167_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB167_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB167_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB167_1;
+; SM60-NEXT:  $L__BB167_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB168_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB168_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB168_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB168_1;
+; SM60-NEXT:  $L__BB168_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB169_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB169_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB169_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB169_1;
+; SM60-NEXT:  $L__BB169_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB170_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB170_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB170_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB170_1;
+; SM60-NEXT:  $L__BB170_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB171_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB171_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB171_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB171_1;
+; SM60-NEXT:  $L__BB171_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB172_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB172_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB172_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB172_1;
+; SM60-NEXT:  $L__BB172_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB173_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB173_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB173_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB173_1;
+; SM60-NEXT:  $L__BB173_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB174_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB174_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB174_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB174_1;
+; SM60-NEXT:  $L__BB174_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB175_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB175_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB175_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB175_1;
+; SM60-NEXT:  $L__BB175_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB176_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB176_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB176_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB176_1;
+; SM60-NEXT:  $L__BB176_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB177_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB177_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB177_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB177_1;
+; SM60-NEXT:  $L__BB177_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB178_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB178_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB178_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB178_1;
+; SM60-NEXT:  $L__BB178_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB179_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB179_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB179_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB179_1;
+; SM60-NEXT:  $L__BB179_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB180_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB180_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB180_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB180_1;
+; SM60-NEXT:  $L__BB180_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB181_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB181_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB181_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB181_1;
+; SM60-NEXT:  $L__BB181_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB182_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB182_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB182_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB182_1;
+; SM60-NEXT:  $L__BB182_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB183_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB183_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB183_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB183_1;
+; SM60-NEXT:  $L__BB183_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB184_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB184_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB184_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB184_1;
+; SM60-NEXT:  $L__BB184_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB185_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB185_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB185_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB185_1;
+; SM60-NEXT:  $L__BB185_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB186_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB186_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB186_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB186_1;
+; SM60-NEXT:  $L__BB186_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB187_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB187_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB187_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB187_1;
+; SM60-NEXT:  $L__BB187_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB188_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB188_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB188_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB188_1;
+; SM60-NEXT:  $L__BB188_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB189_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB189_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB189_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB189_1;
+; SM60-NEXT:  $L__BB189_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB190_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB190_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB190_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB190_1;
+; SM60-NEXT:  $L__BB190_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB191_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB191_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB191_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB191_1;
+; SM60-NEXT:  $L__BB191_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB192_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB192_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB192_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB192_1;
+; SM60-NEXT:  $L__BB192_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB193_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB193_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB193_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB193_1;
+; SM60-NEXT:  $L__BB193_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB194_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB194_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB194_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB194_1;
+; SM60-NEXT:  $L__BB194_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB195_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB195_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB195_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB195_1;
+; SM60-NEXT:  $L__BB195_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB196_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB196_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB196_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB196_1;
+; SM60-NEXT:  $L__BB196_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB197_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB197_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB197_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB197_1;
+; SM60-NEXT:  $L__BB197_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB198_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB198_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB198_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB198_1;
+; SM60-NEXT:  $L__BB198_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB199_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB199_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB199_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB199_1;
+; SM60-NEXT:  $L__BB199_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB200_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB200_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB200_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB200_1;
+; SM60-NEXT:  $L__BB200_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB201_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB201_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB201_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB201_1;
+; SM60-NEXT:  $L__BB201_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB202_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB202_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB202_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB202_1;
+; SM60-NEXT:  $L__BB202_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB203_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB203_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB203_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB203_1;
+; SM60-NEXT:  $L__BB203_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB204_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB204_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB204_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB204_1;
+; SM60-NEXT:  $L__BB204_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB205_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB205_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB205_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB205_1;
+; SM60-NEXT:  $L__BB205_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB206_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB206_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB206_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB206_1;
+; SM60-NEXT:  $L__BB206_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB207_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB207_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB207_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB207_1;
+; SM60-NEXT:  $L__BB207_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB208_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB208_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB208_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB208_1;
+; SM60-NEXT:  $L__BB208_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB209_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB209_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB209_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB209_1;
+; SM60-NEXT:  $L__BB209_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB210_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB210_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB210_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB210_1;
+; SM60-NEXT:  $L__BB210_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(1), syncscope("block"), success release / failure seq_cst.
+; SM60-LABEL: release_seq_cst_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB211_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB211_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB211_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB211_1;
+; SM60-NEXT:  $L__BB211_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ; Expected lowering (checks above): masked 32-bit atom.global.cas retry loop, with membar.cta before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(1), syncscope("device"), success release / failure seq_cst.
+; SM60-LABEL: release_seq_cst_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB212_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB212_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB212_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB212_1;
+; SM60-NEXT:  $L__BB212_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ; Expected lowering (checks above): masked 32-bit atom.global.cas retry loop, with membar.gl before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(3), syncscope(""), success release / failure seq_cst.
+; SM60-LABEL: release_seq_cst_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB213_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB213_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB213_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB213_1;
+; SM60-NEXT:  $L__BB213_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ; Expected lowering (checks above): masked 32-bit atom.shared.cas retry loop, with membar.sys before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(3), syncscope("block"), success release / failure seq_cst.
+; SM60-LABEL: release_seq_cst_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB214_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB214_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB214_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB214_1;
+; SM60-NEXT:  $L__BB214_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ; Expected lowering (checks above): masked 32-bit atom.shared.cas retry loop, with membar.cta before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(3), syncscope("device"), success release / failure seq_cst.
+; SM60-LABEL: release_seq_cst_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB215_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB215_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB215_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB215_1;
+; SM60-NEXT:  $L__BB215_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ; Expected lowering (checks above): masked 32-bit atom.shared.cas retry loop, with membar.gl before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, plain ptr (no addrspace), syncscope(""), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB216_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB216_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB216_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB216_1;
+; SM60-NEXT:  $L__BB216_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.cas retry loop, with membar.sys before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, plain ptr (no addrspace), syncscope("block"), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB217_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB217_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB217_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB217_1;
+; SM60-NEXT:  $L__BB217_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.cas retry loop, with membar.cta before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, plain ptr (no addrspace), syncscope("device"), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB218_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB218_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB218_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB218_1;
+; SM60-NEXT:  $L__BB218_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.cas retry loop, with membar.gl before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(1), syncscope(""), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB219_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB219_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB219_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB219_1;
+; SM60-NEXT:  $L__BB219_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.global.cas retry loop, with membar.sys before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(1), syncscope("block"), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB220_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB220_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB220_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB220_1;
+; SM60-NEXT:  $L__BB220_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.global.cas retry loop, with membar.cta before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(1), syncscope("device"), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB221_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB221_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB221_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB221_1;
+; SM60-NEXT:  $L__BB221_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.global.cas retry loop, with membar.gl before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(3), syncscope(""), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB222_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB222_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB222_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB222_1;
+; SM60-NEXT:  $L__BB222_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.shared.cas retry loop, with membar.sys before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(3), syncscope("block"), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB223_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB223_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB223_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB223_1;
+; SM60-NEXT:  $L__BB223_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.shared.cas retry loop, with membar.cta before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, ptr addrspace(3), syncscope("device"), success acq_rel / failure monotonic.
+; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB224_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB224_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB224_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB224_1;
+; SM60-NEXT:  $L__BB224_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ; Expected lowering (checks above): masked 32-bit atom.shared.cas retry loop, with membar.gl before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, plain ptr (no addrspace), syncscope(""), success acq_rel / failure acquire.
+; SM60-LABEL: acq_rel_acquire_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB225_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB225_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB225_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB225_1;
+; SM60-NEXT:  $L__BB225_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ; Expected lowering (checks above): masked 32-bit atom.cas retry loop, with membar.sys before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, plain ptr (no addrspace), syncscope("block"), success acq_rel / failure acquire.
+; SM60-LABEL: acq_rel_acquire_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB226_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB226_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB226_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB226_1;
+; SM60-NEXT:  $L__BB226_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ; Expected lowering (checks above): masked 32-bit atom.cas retry loop, with membar.cta before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, plain ptr (no addrspace), syncscope("device"), success acq_rel / failure acquire.
+; SM60-LABEL: acq_rel_acquire_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB227_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB227_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB227_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB227_1;
+; SM60-NEXT:  $L__BB227_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ; Expected lowering (checks above): masked 32-bit atom.cas retry loop, with membar.gl before the loop and at %partword.cmpxchg.end.
+    ret i16 %new ; Returns the %new argument; %pairold is unused.
+}
+
+define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB228_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB228_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB228_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB228_1;
+; SM60-NEXT:  $L__BB228_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB229_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB229_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB229_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB229_1;
+; SM60-NEXT:  $L__BB229_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB230_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB230_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB230_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB230_1;
+; SM60-NEXT:  $L__BB230_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB231_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB231_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB231_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB231_1;
+; SM60-NEXT:  $L__BB231_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB232_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB232_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB232_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB232_1;
+; SM60-NEXT:  $L__BB232_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB233_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB233_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB233_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB233_1;
+; SM60-NEXT:  $L__BB233_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB234_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB234_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB234_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB234_1;
+; SM60-NEXT:  $L__BB234_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB235_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB235_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB235_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB235_1;
+; SM60-NEXT:  $L__BB235_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB236_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB236_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB236_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB236_1;
+; SM60-NEXT:  $L__BB236_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB237_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB237_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB237_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB237_1;
+; SM60-NEXT:  $L__BB237_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB238_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB238_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB238_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB238_1;
+; SM60-NEXT:  $L__BB238_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB239_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB239_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB239_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB239_1;
+; SM60-NEXT:  $L__BB239_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB240_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB240_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB240_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB240_1;
+; SM60-NEXT:  $L__BB240_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB241_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB241_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB241_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB241_1;
+; SM60-NEXT:  $L__BB241_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB242_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB242_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB242_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB242_1;
+; SM60-NEXT:  $L__BB242_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB243_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB243_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB243_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB243_1;
+; SM60-NEXT:  $L__BB243_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB244_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB244_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB244_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB244_1;
+; SM60-NEXT:  $L__BB244_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB245_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB245_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB245_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB245_1;
+; SM60-NEXT:  $L__BB245_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB246_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB246_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB246_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB246_1;
+; SM60-NEXT:  $L__BB246_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB247_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB247_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB247_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB247_1;
+; SM60-NEXT:  $L__BB247_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB248_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB248_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB248_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB248_1;
+; SM60-NEXT:  $L__BB248_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB249_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB249_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB249_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB249_1;
+; SM60-NEXT:  $L__BB249_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB250_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB250_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB250_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB250_1;
+; SM60-NEXT:  $L__BB250_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB251_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB251_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB251_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB251_1;
+; SM60-NEXT:  $L__BB251_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB252_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB252_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB252_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB252_1;
+; SM60-NEXT:  $L__BB252_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB253_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB253_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB253_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB253_1;
+; SM60-NEXT:  $L__BB253_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB254_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB254_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB254_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB254_1;
+; SM60-NEXT:  $L__BB254_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB255_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB255_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB255_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB255_1;
+; SM60-NEXT:  $L__BB255_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB256_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB256_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB256_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB256_1;
+; SM60-NEXT:  $L__BB256_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB257_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB257_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB257_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB257_1;
+; SM60-NEXT:  $L__BB257_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB258_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB258_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB258_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB258_1;
+; SM60-NEXT:  $L__BB258_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB259_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB259_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB259_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB259_1;
+; SM60-NEXT:  $L__BB259_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB260_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB260_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB260_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB260_1;
+; SM60-NEXT:  $L__BB260_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB261_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB261_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB261_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB261_1;
+; SM60-NEXT:  $L__BB261_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB262_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB262_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB262_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB262_1;
+; SM60-NEXT:  $L__BB262_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB263_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB263_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB263_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB263_1;
+; SM60-NEXT:  $L__BB263_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB264_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB264_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB264_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB264_1;
+; SM60-NEXT:  $L__BB264_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB265_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB265_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB265_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB265_1;
+; SM60-NEXT:  $L__BB265_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB266_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB266_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB266_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB266_1;
+; SM60-NEXT:  $L__BB266_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB267_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB267_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB267_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB267_1;
+; SM60-NEXT:  $L__BB267_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB268_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB268_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB268_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB268_1;
+; SM60-NEXT:  $L__BB268_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB269_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB269_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB269_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB269_1;
+; SM60-NEXT:  $L__BB269_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("device"), success=monotonic, failure=acquire.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope(""), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.sys ahead of atom.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("block"), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.cta ahead of atom.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("device"), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.gl ahead of atom.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope(""), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.sys ahead of atom.global.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("block"), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.cta ahead of atom.global.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("device"), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.gl ahead of atom.global.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope(""), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.sys ahead of atom.shared.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("block"), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.cta ahead of atom.shared.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("device"), success=monotonic, failure=seq_cst.
+; The SM60 checks pin membar.gl ahead of atom.shared.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope(""), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("block"), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("device"), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope(""), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("block"), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("device"), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope(""), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("block"), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("device"), success=acquire, failure=monotonic.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope(""), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("block"), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("device"), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope(""), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("block"), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("device"), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope(""), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("block"), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("device"), success=acquire, failure=acquire.
+; The SM60 checks pin a bare atom.shared.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope(""), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.sys ahead of atom.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("block"), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.cta ahead of atom.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("device"), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.gl ahead of atom.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope(""), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.sys ahead of atom.global.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("block"), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.cta ahead of atom.global.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("device"), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.gl ahead of atom.global.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope(""), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.sys ahead of atom.shared.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("block"), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.cta ahead of atom.shared.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(3), syncscope("device"), success=acquire, failure=seq_cst.
+; The SM60 checks pin membar.gl ahead of atom.shared.cas.b32 and no fence after it; %new is returned and %pairold goes unused.
+define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope(""), success=release, failure=monotonic.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("block"), success=release, failure=monotonic.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr, syncscope("device"), success=release, failure=monotonic.
+; The SM60 checks pin a bare atom.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope(""), success=release, failure=monotonic.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("block"), success=release, failure=monotonic.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+; cmpxchg i32 via ptr addrspace(1), syncscope("device"), success=release, failure=monotonic.
+; The SM60 checks pin a bare atom.global.cas.b32 with no membar around it; %new is returned and %pairold goes unused.
+define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared_cta(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared_gpu(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_generic(
+define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_global(
+define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_shared(
+define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_generic(
+define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_global(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_shared(
+define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_generic(
+define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_global(
+define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_shared(
+define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_generic(
+define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_global(
+define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_shared(
+define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_generic(
+define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_global(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_shared(
+define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_generic(
+define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_global(
+define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_shared(
+define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_generic(
+define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_global(
+define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_shared(
+define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_generic(
+define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_global(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_shared(
+define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_generic(
+define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_global(
+define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_shared(
+define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_generic(
+define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_global(
+define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_shared(
+define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_generic(
+define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_global(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_shared(
+define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_generic(
+define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_global(
+define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_shared(
+define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_generic(
+define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_global(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_shared(
+define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_generic(
+define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared_sys(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_global(
+define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared_cta(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared_gpu(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_generic(
+define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_global(
+define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_shared(
+define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_generic(
+define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_global(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_shared(
+define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_generic(
+define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_global(
+define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_shared(
+define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_generic(
+define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_global(
+define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_shared(
+define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_shared(
+define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_global(
+define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_generic(
+define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_global(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_shared(
+define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_generic(
+define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_global(
+define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_global(
+define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_shared(
+define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_acquire_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_acquire_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_acquire_i64_shared(
+define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT:    membar.cta;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT:    membar.gl;
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
index 7107fbcf6eb54..4096b038b2352 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
 
-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB1_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB2_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_acquire_i8_generic(
+define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -158,15 +158,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB3_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB3_1;
 ; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_acquire_i8_global(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB4_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB4_1;
 ; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_acquire_i8_shared(
+define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB5_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB5_1;
 ; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB6_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB6_1;
 ; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_seq_cst_i8_global(
+define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -344,15 +338,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB7_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB7_1;
 ; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB8_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB8_1;
 ; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_monotonic_i8_generic(
+define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -437,7 +428,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -445,7 +436,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB9_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -458,12 +449,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -471,8 +462,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -483,15 +474,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB10_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB10_1;
 ; SM70-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_monotonic_i8_shared(
+define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -517,8 +508,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -529,15 +520,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB11_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB11_1;
 ; SM70-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -563,8 +554,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -575,15 +566,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB12_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -596,12 +587,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_acquire_i8_global(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -609,8 +600,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -621,7 +612,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -629,7 +620,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB13_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB13_1;
 ; SM70-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -655,8 +646,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -667,15 +658,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB14_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB14_1;
 ; SM70-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_seq_cst_i8_generic(
+define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -701,9 +692,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -714,15 +704,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB15_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -735,12 +725,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_seq_cst_i8_global(
+define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -748,9 +738,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -761,15 +750,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB16_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB16_1;
 ; SM70-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_seq_cst_i8_shared(
+define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -795,9 +784,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -808,7 +796,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -816,7 +804,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB17_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB17_1;
 ; SM70-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -842,9 +830,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -855,7 +843,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -863,7 +851,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB18_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB18_1;
 ; SM70-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_global(
+define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -888,9 +877,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -901,15 +890,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB19_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB19_1;
 ; SM70-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -934,9 +924,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -947,15 +937,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB20_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB20_1;
 ; SM70-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_generic(
+define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -980,9 +971,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -993,15 +984,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB21_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1014,12 +1005,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_global(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1027,9 +1018,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1040,7 +1031,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1048,7 +1039,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB22_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB22_1;
 ; SM70-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1074,9 +1065,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1087,15 +1078,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB23_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB23_1;
 ; SM70-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1121,8 +1112,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1134,15 +1125,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB24_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1155,12 +1146,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1168,9 +1159,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1181,15 +1172,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB25_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB25_1;
 ; SM70-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1215,9 +1206,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1228,7 +1219,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1236,7 +1227,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB26_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB26_1;
 ; SM70-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_generic(
+define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1262,9 +1253,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1275,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1283,7 +1273,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB27_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1296,12 +1286,12 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_global(
+define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1309,9 +1299,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1322,15 +1311,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB28_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB28_1;
 ; SM70-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_shared(
+define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1356,9 +1345,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1369,15 +1357,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB29_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB29_1;
 ; SM70-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1403,9 +1391,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1416,15 +1403,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB30_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1437,12 +1424,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_global(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1450,9 +1437,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1463,7 +1449,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1471,7 +1457,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB31_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB31_1;
 ; SM70-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_shared(
+define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1497,9 +1483,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1510,15 +1495,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB32_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB32_1;
 ; SM70-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_generic(
+define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1544,9 +1529,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1557,15 +1541,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB33_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1578,12 +1562,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_global(
+define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1591,9 +1575,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1604,15 +1587,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB34_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB34_1;
 ; SM70-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_shared(
+define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1638,9 +1621,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB35_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB35_1;
 ; SM70-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_generic(
+define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB36_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_global(
+define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB37_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB37_1;
 ; SM70-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_shared(
+define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB38_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB38_1;
 ; SM70-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_generic(
+define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1839,15 +1817,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB39_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1860,12 +1838,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_global(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB40_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB40_1;
 ; SM70-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_shared(
+define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB41_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB41_1;
 ; SM70-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_generic(
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB42_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2001,12 +1976,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_global(
+define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -2027,15 +2001,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB43_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB43_1;
 ; SM70-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_shared(
+define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
@@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB44_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2092,3589 +2065,14968 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB44_1;
 ; SM70-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
     ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB45_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB45_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB45_1;
 ; SM70-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_global(
+define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB46_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB46_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB46_1;
 ; SM70-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_shared(
+define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB47_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB47_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB47_1;
 ; SM70-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_generic(
+define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB48_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB48_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB48_1;
 ; SM70-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_global(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB49_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB49_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB49_1;
 ; SM70-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_shared(
+define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB50_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB50_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB50_1;
 ; SM70-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB51_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB51_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB51_1;
 ; SM70-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB52_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB52_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB52_1;
 ; SM70-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB53_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB53_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB53_1;
 ; SM70-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_monotonic_i16_generic(
+define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB54_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB54_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB54_1;
 ; SM70-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_monotonic_i16_global(
+define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB55_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB55_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB55_1;
 ; SM70-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_monotonic_i16_shared(
+define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB56_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB56_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB56_1;
 ; SM70-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_acquire_i16_generic(
+define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB57_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB57_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB57_1;
 ; SM70-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_acquire_i16_global(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB58_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB58_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB58_1;
 ; SM70-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_acquire_i16_shared(
+define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB59_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB59_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB59_1;
 ; SM70-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_seq_cst_i16_generic(
+define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB60_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB60_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB60_1;
 ; SM70-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_seq_cst_i16_global(
+define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB61_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB61_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB61_1;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB61_1;
 ; SM70-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_seq_cst_i16_shared(
+define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB62_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB62_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB62_1;
 ; SM70-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_monotonic_i16_generic(
+define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB63_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB63_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB63_1;
 ; SM70-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_monotonic_i16_global(
+define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB64_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB64_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB64_1;
 ; SM70-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_monotonic_i16_shared(
+define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB65_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB65_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB65_1;
 ; SM70-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_acquire_i16_generic(
+define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB66_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB66_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB66_1;
 ; SM70-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_acquire_i16_global(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB67_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB67_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB67_1;
 ; SM70-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_acquire_i16_shared(
+define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB68_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB68_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB68_1;
 ; SM70-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_seq_cst_i16_generic(
+define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB69_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB69_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB69_1;
 ; SM70-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_seq_cst_i16_global(
+define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB70_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB70_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB70_1;
 ; SM70-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_seq_cst_i16_shared(
+define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB71_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB71_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB71_1;
 ; SM70-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_monotonic_i16_generic(
+define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB72_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB72_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB72_1;
 ; SM70-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_monotonic_i16_global(
+define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB73_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB73_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB73_1;
 ; SM70-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_monotonic_i16_shared(
+define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB74_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB74_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB74_1;
 ; SM70-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_acquire_i16_generic(
+define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB75_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB75_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB75_1;
 ; SM70-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_acquire_i16_global(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB76_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB76_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB76_1;
 ; SM70-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_acquire_i16_shared(
+define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB77_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB77_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB77_1;
 ; SM70-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_generic(
+define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB78_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB78_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB78_1;
 ; SM70-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_global(
+define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB79_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB79_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB79_1;
 ; SM70-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_shared(
+define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB80_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB80_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB80_1;
 ; SM70-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_generic(
+define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB81_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB81_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB81_1;
 ; SM70-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_global(
+define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB82_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB82_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB82_1;
 ; SM70-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_shared(
+define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB83_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB83_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB83_1;
 ; SM70-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_generic(
+define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB84_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB84_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB84_1;
 ; SM70-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_global(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB85_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB85_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB85_1;
 ; SM70-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_shared(
+define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB86_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB86_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB86_1;
 ; SM70-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_generic(
+define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB87_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB87_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB87_1;
 ; SM70-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_global(
+define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<21>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB88_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB88_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB88_1;
 ; SM70-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB89_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB89_1;
+; SM70-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB90_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB90_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB90_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB90_1;
+; SM70-NEXT:  $L__BB90_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB91_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB91_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB91_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB91_1;
+; SM70-NEXT:  $L__BB91_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB92_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB92_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB92_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB92_1;
+; SM70-NEXT:  $L__BB92_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB93_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB93_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB93_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB93_1;
+; SM70-NEXT:  $L__BB93_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB94_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB94_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB94_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB94_1;
+; SM70-NEXT:  $L__BB94_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB95_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB95_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB95_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB95_1;
+; SM70-NEXT:  $L__BB95_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB96_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB96_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB96_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB96_1;
+; SM70-NEXT:  $L__BB96_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB97_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB97_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB97_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB97_1;
+; SM70-NEXT:  $L__BB97_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB98_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB98_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB98_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB98_1;
+; SM70-NEXT:  $L__BB98_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB99_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB99_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB99_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB99_1;
+; SM70-NEXT:  $L__BB99_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB100_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB100_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB100_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB100_1;
+; SM70-NEXT:  $L__BB100_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB101_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB101_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB101_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB101_1;
+; SM70-NEXT:  $L__BB101_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB102_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB102_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB102_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB102_1;
+; SM70-NEXT:  $L__BB102_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB103_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB103_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB103_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB103_1;
+; SM70-NEXT:  $L__BB103_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB104_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB104_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB104_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB104_1;
+; SM70-NEXT:  $L__BB104_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB105_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB105_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB105_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB105_1;
+; SM70-NEXT:  $L__BB105_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB106_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB106_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB106_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB106_1;
+; SM70-NEXT:  $L__BB106_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB107_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB107_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB107_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB107_1;
+; SM70-NEXT:  $L__BB107_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB108_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB108_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB108_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB108_1;
+; SM70-NEXT:  $L__BB108_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB109_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB109_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB109_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB109_1;
+; SM70-NEXT:  $L__BB109_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB110_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB110_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB110_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB110_1;
+; SM70-NEXT:  $L__BB110_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB111_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB111_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB111_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB111_1;
+; SM70-NEXT:  $L__BB111_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB112_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB112_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB112_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB112_1;
+; SM70-NEXT:  $L__BB112_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB113_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB113_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB113_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB113_1;
+; SM70-NEXT:  $L__BB113_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB114_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB114_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB114_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB114_1;
+; SM70-NEXT:  $L__BB114_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB115_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB115_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB115_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB115_1;
+; SM70-NEXT:  $L__BB115_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB116_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB116_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB116_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB116_1;
+; SM70-NEXT:  $L__BB116_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB117_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB117_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB117_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB117_1;
+; SM70-NEXT:  $L__BB117_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB118_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB118_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB118_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB118_1;
+; SM70-NEXT:  $L__BB118_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB119_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB119_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB119_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB119_1;
+; SM70-NEXT:  $L__BB119_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB120_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB120_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB120_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB120_1;
+; SM70-NEXT:  $L__BB120_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB121_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB121_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB121_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB121_1;
+; SM70-NEXT:  $L__BB121_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB122_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB122_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB122_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB122_1;
+; SM70-NEXT:  $L__BB122_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB123_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB123_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB123_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB123_1;
+; SM70-NEXT:  $L__BB123_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB124_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB124_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB124_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB124_1;
+; SM70-NEXT:  $L__BB124_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB125_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB125_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB125_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB125_1;
+; SM70-NEXT:  $L__BB125_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB126_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB126_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB126_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB126_1;
+; SM70-NEXT:  $L__BB126_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB127_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB127_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB127_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB127_1;
+; SM70-NEXT:  $L__BB127_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB128_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB128_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB128_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB128_1;
+; SM70-NEXT:  $L__BB128_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global (addrspace(1)) pointer, syncscope(""), seq_cst/seq_cst.
+; Expected PTX: fence.sc.sys, then a 32-bit atom.relaxed.sys.global.cas loop on
+; the 4-byte-aligned word containing the byte, then fence.acq_rel.sys.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB129_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB129_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB129_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB129_1;
+; SM70-NEXT:  $L__BB129_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global (addrspace(1)) pointer, syncscope("block"),
+; seq_cst/seq_cst. Expected PTX: fence.sc.cta, then a 32-bit
+; atom.relaxed.cta.global.cas loop on the 4-byte-aligned word containing the
+; byte, then fence.acq_rel.cta.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB130_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB130_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB130_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB130_1;
+; SM70-NEXT:  $L__BB130_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global (addrspace(1)) pointer, syncscope("device"),
+; seq_cst/seq_cst. Expected PTX: fence.sc.gpu, then a 32-bit
+; atom.relaxed.gpu.global.cas loop on the 4-byte-aligned word containing the
+; byte, then fence.acq_rel.gpu.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB131_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB131_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB131_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB131_1;
+; SM70-NEXT:  $L__BB131_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared (addrspace(3)) pointer, syncscope(""), seq_cst/seq_cst.
+; Expected PTX: fence.sc.sys, then a 32-bit atom.relaxed.sys.shared.cas loop on
+; the 4-byte-aligned word containing the byte, then fence.acq_rel.sys.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB132_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB132_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB132_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB132_1;
+; SM70-NEXT:  $L__BB132_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared (addrspace(3)) pointer, syncscope("block"),
+; seq_cst/seq_cst. Expected PTX: fence.sc.cta, then a 32-bit
+; atom.relaxed.cta.shared.cas loop on the 4-byte-aligned word containing the
+; byte, then fence.acq_rel.cta.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB133_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB133_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB133_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB133_1;
+; SM70-NEXT:  $L__BB133_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared (addrspace(3)) pointer, syncscope("device"),
+; seq_cst/seq_cst. Expected PTX: fence.sc.gpu, then a 32-bit
+; atom.relaxed.gpu.shared.cas loop on the 4-byte-aligned word containing the
+; byte, then fence.acq_rel.gpu.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB134_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB134_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB134_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB134_1;
+; SM70-NEXT:  $L__BB134_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+; i16 cmpxchg on a generic pointer, syncscope(""), monotonic/monotonic.
+; Expected PTX: a 32-bit atom.relaxed.sys.cas loop on the 4-byte-aligned word
+; containing the halfword, with no fence before or after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB135_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB135_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB135_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB135_1;
+; SM70-NEXT:  $L__BB135_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic pointer, syncscope("block"), monotonic/monotonic.
+; Expected PTX: a 32-bit atom.relaxed.cta.cas loop on the 4-byte-aligned word
+; containing the halfword, with no fence before or after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB136_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB136_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB136_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB136_1;
+; SM70-NEXT:  $L__BB136_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic pointer, syncscope("device"), monotonic/monotonic.
+; Expected PTX: a 32-bit atom.relaxed.gpu.cas loop on the 4-byte-aligned word
+; containing the halfword, with no fence before or after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB137_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB137_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB137_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB137_1;
+; SM70-NEXT:  $L__BB137_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a global (addrspace(1)) pointer, syncscope(""),
+; monotonic/monotonic. Expected PTX: a 32-bit atom.relaxed.sys.global.cas loop
+; on the 4-byte-aligned word containing the halfword, with no fence before or
+; after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB138_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB138_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB138_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB138_1;
+; SM70-NEXT:  $L__BB138_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a global (addrspace(1)) pointer, syncscope("block"),
+; monotonic/monotonic. Expected PTX: a 32-bit atom.relaxed.cta.global.cas loop
+; on the 4-byte-aligned word containing the halfword, with no fence before or
+; after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB139_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB139_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB139_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB139_1;
+; SM70-NEXT:  $L__BB139_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a global (addrspace(1)) pointer, syncscope("device"),
+; monotonic/monotonic. Expected PTX: a 32-bit atom.relaxed.gpu.global.cas loop
+; on the 4-byte-aligned word containing the halfword, with no fence before or
+; after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB140_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB140_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB140_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB140_1;
+; SM70-NEXT:  $L__BB140_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a shared (addrspace(3)) pointer, syncscope(""),
+; monotonic/monotonic. Expected PTX: a 32-bit atom.relaxed.sys.shared.cas loop
+; on the 4-byte-aligned word containing the halfword, with no fence before or
+; after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB141_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB141_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB141_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB141_1;
+; SM70-NEXT:  $L__BB141_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a shared (addrspace(3)) pointer, syncscope("block"),
+; monotonic/monotonic. Expected PTX: a 32-bit atom.relaxed.cta.shared.cas loop
+; on the 4-byte-aligned word containing the halfword, with no fence before or
+; after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB142_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB142_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB142_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB142_1;
+; SM70-NEXT:  $L__BB142_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a shared (addrspace(3)) pointer, syncscope("device"),
+; monotonic/monotonic. Expected PTX: a 32-bit atom.relaxed.gpu.shared.cas loop
+; on the 4-byte-aligned word containing the halfword, with no fence before or
+; after the loop.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB143_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB143_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB143_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB143_1;
+; SM70-NEXT:  $L__BB143_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic pointer, syncscope(""), monotonic success /
+; acquire failure ordering. Expected PTX: no fence before the loop, a 32-bit
+; atom.relaxed.sys.cas loop on the 4-byte-aligned word containing the
+; halfword, then fence.acq_rel.sys.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB144_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB144_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB144_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB144_1;
+; SM70-NEXT:  $L__BB144_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic pointer, syncscope("block"), monotonic success /
+; acquire failure ordering. Expected PTX: no fence before the loop, a 32-bit
+; atom.relaxed.cta.cas loop on the 4-byte-aligned word containing the
+; halfword, then fence.acq_rel.cta.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB145_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB145_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB145_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB145_1;
+; SM70-NEXT:  $L__BB145_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB146_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB146_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB146_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB146_1;
+; SM70-NEXT:  $L__BB146_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB147_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB147_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB147_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB147_1;
+; SM70-NEXT:  $L__BB147_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB148_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB148_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB148_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB148_1;
+; SM70-NEXT:  $L__BB148_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB149_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB149_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB149_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB149_1;
+; SM70-NEXT:  $L__BB149_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB150_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB150_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB150_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB150_1;
+; SM70-NEXT:  $L__BB150_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB151_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB151_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB151_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB151_1;
+; SM70-NEXT:  $L__BB151_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB152_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB152_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB152_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB152_1;
+; SM70-NEXT:  $L__BB152_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB153_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB153_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB153_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB153_1;
+; SM70-NEXT:  $L__BB153_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB154_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB154_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB154_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB154_1;
+; SM70-NEXT:  $L__BB154_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB155_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB155_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB155_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB155_1;
+; SM70-NEXT:  $L__BB155_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB156_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB156_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB156_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB156_1;
+; SM70-NEXT:  $L__BB156_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB157_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB157_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB157_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB157_1;
+; SM70-NEXT:  $L__BB157_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB158_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB158_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB158_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB158_1;
+; SM70-NEXT:  $L__BB158_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB159_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB159_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB159_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB159_1;
+; SM70-NEXT:  $L__BB159_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB160_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB160_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB160_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB160_1;
+; SM70-NEXT:  $L__BB160_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, shared address space (addrspace 3), syncscope("device"), monotonic success / seq_cst failure ordering
+; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB161_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB161_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB161_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB161_1;
+; SM70-NEXT:  $L__BB161_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ; expected lowering above is a leading fence.sc.gpu, a masked 32-bit CAS loop (atom.relaxed.gpu.shared.cas.b32), then a trailing fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, generic address space, syncscope(""), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB162_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB162_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB162_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB162_1;
+; SM70-NEXT:  $L__BB162_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.sys.cas.b32) followed by fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, generic address space, syncscope("block"), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB163_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB163_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB163_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB163_1;
+; SM70-NEXT:  $L__BB163_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.cta.cas.b32) followed by fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, generic address space, syncscope("device"), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB164_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB164_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB164_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB164_1;
+; SM70-NEXT:  $L__BB164_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.gpu.cas.b32) followed by fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, global address space (addrspace 1), syncscope(""), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB165_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB165_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB165_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB165_1;
+; SM70-NEXT:  $L__BB165_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.sys.global.cas.b32) followed by fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, global address space (addrspace 1), syncscope("block"), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB166_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB166_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB166_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB166_1;
+; SM70-NEXT:  $L__BB166_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.cta.global.cas.b32) followed by fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, global address space (addrspace 1), syncscope("device"), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB167_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB167_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB167_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB167_1;
+; SM70-NEXT:  $L__BB167_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.gpu.global.cas.b32) followed by fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, shared address space (addrspace 3), syncscope(""), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB168_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB168_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB168_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB168_1;
+; SM70-NEXT:  $L__BB168_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.sys.shared.cas.b32) followed by fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, shared address space (addrspace 3), syncscope("block"), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB169_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB169_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB169_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB169_1;
+; SM70-NEXT:  $L__BB169_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.cta.shared.cas.b32) followed by fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, shared address space (addrspace 3), syncscope("device"), acquire success / monotonic failure ordering
+; SM70-LABEL: acquire_monotonic_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB170_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB170_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB170_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB170_1;
+; SM70-NEXT:  $L__BB170_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.gpu.shared.cas.b32) followed by fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, generic address space, syncscope(""), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB171_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB171_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB171_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB171_1;
+; SM70-NEXT:  $L__BB171_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.sys.cas.b32) followed by fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, generic address space, syncscope("block"), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB172_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB172_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB172_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB172_1;
+; SM70-NEXT:  $L__BB172_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.cta.cas.b32) followed by fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, generic address space, syncscope("device"), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB173_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB173_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB173_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB173_1;
+; SM70-NEXT:  $L__BB173_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.gpu.cas.b32) followed by fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, global address space (addrspace 1), syncscope(""), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB174_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB174_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB174_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB174_1;
+; SM70-NEXT:  $L__BB174_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.sys.global.cas.b32) followed by fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, global address space (addrspace 1), syncscope("block"), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB175_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB175_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB175_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB175_1;
+; SM70-NEXT:  $L__BB175_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.cta.global.cas.b32) followed by fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, global address space (addrspace 1), syncscope("device"), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB176_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB176_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB176_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB176_1;
+; SM70-NEXT:  $L__BB176_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.gpu.global.cas.b32) followed by fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, shared address space (addrspace 3), syncscope(""), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB177_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB177_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB177_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB177_1;
+; SM70-NEXT:  $L__BB177_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.sys.shared.cas.b32) followed by fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg, shared address space (addrspace 3), syncscope("block"), acquire success / acquire failure ordering
+; SM70-LABEL: acquire_acquire_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB178_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB178_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB178_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB178_1;
+; SM70-NEXT:  $L__BB178_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ; expected lowering above is a masked 32-bit CAS loop (atom.relaxed.cta.shared.cas.b32) followed by fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(3) at syncscope("device"): success acquire, failure acquire
+; SM70-LABEL: acquire_acquire_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB179_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB179_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB179_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB179_1;
+; SM70-NEXT:  $L__BB179_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ; checks above expect a 32-bit atom.relaxed.gpu.shared CAS loop, then fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr at syncscope(""): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB180_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB180_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB180_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB180_1;
+; SM70-NEXT:  $L__BB180_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ; checks above expect fence.sc.sys, a 32-bit atom.relaxed.sys CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr at syncscope("block"): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB181_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB181_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB181_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB181_1;
+; SM70-NEXT:  $L__BB181_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ; checks above expect fence.sc.cta, a 32-bit atom.relaxed.cta CAS loop, then fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr at syncscope("device"): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB182_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB182_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB182_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB182_1;
+; SM70-NEXT:  $L__BB182_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ; checks above expect fence.sc.gpu, a 32-bit atom.relaxed.gpu CAS loop, then fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(1) at syncscope(""): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB183_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB183_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB183_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB183_1;
+; SM70-NEXT:  $L__BB183_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ; checks above expect fence.sc.sys, a 32-bit atom.relaxed.sys.global CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(1) at syncscope("block"): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB184_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB184_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB184_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB184_1;
+; SM70-NEXT:  $L__BB184_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ; checks above expect fence.sc.cta, a 32-bit atom.relaxed.cta.global CAS loop, then fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(1) at syncscope("device"): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB185_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB185_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB185_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB185_1;
+; SM70-NEXT:  $L__BB185_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ; checks above expect fence.sc.gpu, a 32-bit atom.relaxed.gpu.global CAS loop, then fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(3) at syncscope(""): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB186_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB186_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB186_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB186_1;
+; SM70-NEXT:  $L__BB186_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ; checks above expect fence.sc.sys, a 32-bit atom.relaxed.sys.shared CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(3) at syncscope("block"): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB187_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB187_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB187_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB187_1;
+; SM70-NEXT:  $L__BB187_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ; checks above expect fence.sc.cta, a 32-bit atom.relaxed.cta.shared CAS loop, then fence.acq_rel.cta
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(3) at syncscope("device"): success acquire, failure seq_cst
+; SM70-LABEL: acquire_seq_cst_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB188_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB188_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB188_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB188_1;
+; SM70-NEXT:  $L__BB188_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ; checks above expect fence.sc.gpu, a 32-bit atom.relaxed.gpu.shared CAS loop, then fence.acq_rel.gpu
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr at syncscope(""): success release, failure monotonic
+; SM70-LABEL: release_monotonic_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB189_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB189_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB189_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB189_1;
+; SM70-NEXT:  $L__BB189_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic ; checks above expect fence.acq_rel.sys before a 32-bit atom.relaxed.sys CAS loop and no fence after it
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr at syncscope("block"): success release, failure monotonic
+; SM70-LABEL: release_monotonic_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB190_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB190_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB190_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB190_1;
+; SM70-NEXT:  $L__BB190_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ; checks above expect fence.acq_rel.cta before a 32-bit atom.relaxed.cta CAS loop and no fence after it
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr at syncscope("device"): success release, failure monotonic
+; SM70-LABEL: release_monotonic_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB191_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB191_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB191_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB191_1;
+; SM70-NEXT:  $L__BB191_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ; checks above expect fence.acq_rel.gpu before a 32-bit atom.relaxed.gpu CAS loop and no fence after it
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(1) at syncscope(""): success release, failure monotonic
+; SM70-LABEL: release_monotonic_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB192_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB192_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB192_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB192_1;
+; SM70-NEXT:  $L__BB192_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ; checks above expect fence.acq_rel.sys before a 32-bit atom.relaxed.sys.global CAS loop and no fence after it
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(1) at syncscope("block"): success release, failure monotonic
+; SM70-LABEL: release_monotonic_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB193_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB193_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB193_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB193_1;
+; SM70-NEXT:  $L__BB193_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ; checks above expect fence.acq_rel.cta before a 32-bit atom.relaxed.cta.global CAS loop and no fence after it
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on addrspace(1) at syncscope("device"): success release, failure monotonic
+; SM70-LABEL: release_monotonic_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB194_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB194_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB194_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB194_1;
+; SM70-NEXT:  $L__BB194_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ; checks above expect fence.acq_rel.gpu before a 32-bit atom.relaxed.gpu.global CAS loop and no fence after it
+    ret i16 %new ; returns the %new argument; %pairold is left unused
+}
+
+define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB195_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB195_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB195_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB195_1;
+; SM70-NEXT:  $L__BB195_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB196_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB196_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB196_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB196_1;
+; SM70-NEXT:  $L__BB196_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB197_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB197_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB197_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB197_1;
+; SM70-NEXT:  $L__BB197_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB198_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB198_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB198_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB198_1;
+; SM70-NEXT:  $L__BB198_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB199_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB199_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB199_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB199_1;
+; SM70-NEXT:  $L__BB199_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB200_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB200_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB200_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB200_1;
+; SM70-NEXT:  $L__BB200_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB201_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB201_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB201_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB201_1;
+; SM70-NEXT:  $L__BB201_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB202_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB202_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB202_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB202_1;
+; SM70-NEXT:  $L__BB202_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB203_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB203_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB203_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB203_1;
+; SM70-NEXT:  $L__BB203_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB204_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB204_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB204_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB204_1;
+; SM70-NEXT:  $L__BB204_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB205_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB205_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB205_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB205_1;
+; SM70-NEXT:  $L__BB205_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB206_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB206_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB206_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB206_1;
+; SM70-NEXT:  $L__BB206_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB207_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB207_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB207_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB207_1;
+; SM70-NEXT:  $L__BB207_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB208_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB208_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB208_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB208_1;
+; SM70-NEXT:  $L__BB208_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB209_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB209_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB209_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB209_1;
+; SM70-NEXT:  $L__BB209_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB210_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB210_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB210_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB210_1;
+; SM70-NEXT:  $L__BB210_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB211_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB211_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB211_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB211_1;
+; SM70-NEXT:  $L__BB211_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB212_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB212_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB212_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB212_1;
+; SM70-NEXT:  $L__BB212_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB213_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB213_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB213_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB213_1;
+; SM70-NEXT:  $L__BB213_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB214_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB214_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB214_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB214_1;
+; SM70-NEXT:  $L__BB214_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB215_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB215_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB215_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB215_1;
+; SM70-NEXT:  $L__BB215_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB216_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB216_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB216_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB216_1;
+; SM70-NEXT:  $L__BB216_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB217_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB217_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB217_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB217_1;
+; SM70-NEXT:  $L__BB217_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB218_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB218_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB218_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB218_1;
+; SM70-NEXT:  $L__BB218_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB219_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB219_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB219_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB219_1;
+; SM70-NEXT:  $L__BB219_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB220_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB220_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB220_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB220_1;
+; SM70-NEXT:  $L__BB220_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB221_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB221_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB221_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB221_1;
+; SM70-NEXT:  $L__BB221_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB222_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB222_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB222_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB222_1;
+; SM70-NEXT:  $L__BB222_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB223_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB223_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB223_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB223_1;
+; SM70-NEXT:  $L__BB223_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB224_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB224_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB224_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB224_1;
+; SM70-NEXT:  $L__BB224_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB225_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB225_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB225_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB225_1;
+; SM70-NEXT:  $L__BB225_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB226_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB226_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB226_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB226_1;
+; SM70-NEXT:  $L__BB226_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB227_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB227_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB227_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB227_1;
+; SM70-NEXT:  $L__BB227_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB228_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB228_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB228_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB228_1;
+; SM70-NEXT:  $L__BB228_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB229_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB229_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB229_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB229_1;
+; SM70-NEXT:  $L__BB229_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB230_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB230_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB230_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB230_1;
+; SM70-NEXT:  $L__BB230_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB231_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB231_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB231_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB231_1;
+; SM70-NEXT:  $L__BB231_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB232_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB232_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB232_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB232_1;
+; SM70-NEXT:  $L__BB232_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB233_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB233_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB233_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB233_1;
+; SM70-NEXT:  $L__BB233_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB234_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB234_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB234_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB234_1;
+; SM70-NEXT:  $L__BB234_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB235_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB235_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB235_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB235_1;
+; SM70-NEXT:  $L__BB235_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB236_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB236_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB236_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB236_1;
+; SM70-NEXT:  $L__BB236_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB237_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB237_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB237_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB237_1;
+; SM70-NEXT:  $L__BB237_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB238_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB238_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB238_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB238_1;
+; SM70-NEXT:  $L__BB238_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB239_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB239_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB239_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB239_1;
+; SM70-NEXT:  $L__BB239_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB240_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB240_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB240_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB240_1;
+; SM70-NEXT:  $L__BB240_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB241_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB241_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB241_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB241_1;
+; SM70-NEXT:  $L__BB241_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB242_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB242_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB242_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB242_1;
+; SM70-NEXT:  $L__BB242_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB243_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB243_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB243_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB243_1;
+; SM70-NEXT:  $L__BB243_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB244_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB244_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB244_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB244_1;
+; SM70-NEXT:  $L__BB244_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB245_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB245_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB245_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB245_1;
+; SM70-NEXT:  $L__BB245_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB246_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB246_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB246_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB246_1;
+; SM70-NEXT:  $L__BB246_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB247_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB247_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB247_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB247_1;
+; SM70-NEXT:  $L__BB247_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB248_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB248_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB248_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB248_1;
+; SM70-NEXT:  $L__BB248_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB249_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB249_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB249_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB249_1;
+; SM70-NEXT:  $L__BB249_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB250_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB250_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB250_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB250_1;
+; SM70-NEXT:  $L__BB250_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB251_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB251_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB251_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB251_1;
+; SM70-NEXT:  $L__BB251_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB252_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB252_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB252_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB252_1;
+; SM70-NEXT:  $L__BB252_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB253_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB253_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB253_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB253_1;
+; SM70-NEXT:  $L__BB253_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB254_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB254_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB254_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB254_1;
+; SM70-NEXT:  $L__BB254_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB255_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB255_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB255_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB255_1;
+; SM70-NEXT:  $L__BB255_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB256_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB256_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB256_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB256_1;
+; SM70-NEXT:  $L__BB256_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB257_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB257_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB257_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB257_1;
+; SM70-NEXT:  $L__BB257_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB258_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB258_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB258_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB258_1;
+; SM70-NEXT:  $L__BB258_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope("block"), success ordering seq_cst, failure ordering acquire.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.cta.shared.cas, preceded by fence.sc.cta and followed by
+; fence.acq_rel.cta. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB259_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB259_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB259_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB259_1;
+; SM70-NEXT:  $L__BB259_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope("device"), success ordering seq_cst, failure ordering acquire.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.gpu.shared.cas, preceded by fence.sc.gpu and followed by
+; fence.acq_rel.gpu. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB260_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB260_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB260_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB260_1;
+; SM70-NEXT:  $L__BB260_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope(""),
+; success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.sys.cas, preceded by fence.sc.sys and followed by
+; fence.acq_rel.sys. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB261_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB261_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB261_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB261_1;
+; SM70-NEXT:  $L__BB261_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope("block"),
+; success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.cta.cas, preceded by fence.sc.cta and followed by
+; fence.acq_rel.cta. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB262_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB262_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB262_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB262_1;
+; SM70-NEXT:  $L__BB262_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope("device"),
+; success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.gpu.cas, preceded by fence.sc.gpu and followed by
+; fence.acq_rel.gpu. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB263_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB263_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB263_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB263_1;
+; SM70-NEXT:  $L__BB263_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope(""), success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.sys.global.cas, preceded by fence.sc.sys and followed by
+; fence.acq_rel.sys. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB264_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB264_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB264_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB264_1;
+; SM70-NEXT:  $L__BB264_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope("block"), success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.cta.global.cas, preceded by fence.sc.cta and followed by
+; fence.acq_rel.cta. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB265_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB265_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB265_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB265_1;
+; SM70-NEXT:  $L__BB265_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope("device"), success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.gpu.global.cas, preceded by fence.sc.gpu and followed by
+; fence.acq_rel.gpu. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB266_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB266_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB266_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB266_1;
+; SM70-NEXT:  $L__BB266_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope(""), success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.sys.shared.cas, preceded by fence.sc.sys and followed by
+; fence.acq_rel.sys. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB267_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB267_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB267_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB267_1;
+; SM70-NEXT:  $L__BB267_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope("block"), success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.cta.shared.cas, preceded by fence.sc.cta and followed by
+; fence.acq_rel.cta. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB268_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB268_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB268_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB268_1;
+; SM70-NEXT:  $L__BB268_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope("device"), success ordering seq_cst, failure ordering seq_cst.
+; The sm_70 checks expect a partword loop around a 32-bit
+; atom.relaxed.gpu.shared.cas, preceded by fence.sc.gpu and followed by
+; fence.acq_rel.gpu. The cmpxchg result is unused; the function returns %new.
+define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB269_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB269_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB269_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB269_1;
+; SM70-NEXT:  $L__BB269_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+; i32 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope(""),
+; success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.sys.cas.b32 with no fences.
+; The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope("block"),
+; success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.cta.cas.b32 with no fences.
+; The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope("device"),
+; success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.gpu.cas.b32 with no fences.
+; The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope(""), success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.sys.global.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2];
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope("block"), success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.cta.global.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope("device"), success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.gpu.global.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope(""), success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.sys.shared.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope("block"), success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.cta.shared.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(3) pointer (checked as .shared) at
+; syncscope("device"), success ordering monotonic, failure ordering monotonic.
+; The sm_70 checks expect a single atom.relaxed.gpu.shared.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+; i32 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope(""),
+; success ordering monotonic, failure ordering acquire.
+; The sm_70 checks expect a single atom.acquire.sys.cas.b32 with no fences.
+; The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+; i32 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope("block"),
+; success ordering monotonic, failure ordering acquire.
+; The sm_70 checks expect a single atom.acquire.cta.cas.b32 with no fences.
+; The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+; i32 cmpxchg on a plain ptr (no addrspace qualifier) at syncscope("device"),
+; success ordering monotonic, failure ordering acquire.
+; The sm_70 checks expect a single atom.acquire.gpu.cas.b32 with no fences.
+; The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope(""), success ordering monotonic, failure ordering acquire.
+; The sm_70 checks expect a single atom.acquire.sys.global.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope("block"), success ordering monotonic, failure ordering acquire.
+; The sm_70 checks expect a single atom.acquire.cta.global.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+; i32 cmpxchg on an addrspace(1) pointer (checked as .global) at
+; syncscope("device"), success ordering monotonic, failure ordering acquire.
+; The sm_70 checks expect a single atom.acquire.gpu.global.cas.b32 with no
+; fences. The cmpxchg result is unused; the function returns %new.
+define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2];
+; SM70-NEXT:    atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2];
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_shared(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB89_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB89_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB89_1;
-; SM70-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_generic(
+define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_global(
+define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_shared(
+define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_generic(
+define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_global(
+define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_shared(
+define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_generic(
+define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_global(
+define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_shared(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_generic(
+define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_global(
+define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_shared(
+define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_generic(
+define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_global(
+define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_shared(
+define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2];
+; SM70-NEXT:    atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_gpu(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_generic(
+define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_global(
+define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_shared(
+define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_generic(
+define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_global(
+define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_shared(
+define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_generic(
+define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_global(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_shared(
+define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_generic(
+define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_global(
+define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_shared(
+define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_generic(
+define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_global(
+define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_shared(
+define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_generic(
+define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_global(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_shared(
+define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_generic(
+define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_global(
+define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_shared(
+define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_generic(
+define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_global(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_shared(
+define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_generic(
+define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared_sys(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_global(
+define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared_gpu(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_generic(
+define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_global(
+define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_shared(
+define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_generic(
+define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_global(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_shared(
+define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_generic(
+define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_global(
+define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_shared(
+define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_generic(
+define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_global(
+define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_shared(
+define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_shared(
+define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_global(
+define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_generic(
+define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_global(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_shared(
+define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_generic(
+define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_global(
+define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_global(
+define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_shared(
+define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_shared(
+define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index f289c3cf3d509..a3e84de5a257b 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
 
-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB0_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB1_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB2_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -158,7 +158,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -166,7 +166,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB3_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB3_1;
 ; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_global(
+define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB4_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB4_1;
 ; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_shared(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB5_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB5_1;
 ; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB6_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB6_1;
 ; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_global(
+define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -344,7 +338,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -352,7 +346,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB7_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB7_1;
 ; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB8_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB8_1;
 ; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -437,15 +428,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB9_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -455,15 +446,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB9_1;
 ; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -471,8 +461,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -483,15 +473,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB10_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB10_1;
 ; SM90-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -517,8 +506,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -529,7 +518,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -537,7 +526,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB11_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB11_1;
 ; SM90-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -563,8 +551,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -575,7 +563,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -583,7 +571,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB12_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -596,12 +584,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_acquire_i8_global(
+define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -609,8 +597,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -621,15 +609,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB13_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -639,15 +627,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB13_1;
 ; SM90-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -655,8 +643,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -667,15 +655,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB14_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -685,15 +673,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB14_1;
 ; SM90-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_seq_cst_i8_generic(
+define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -701,9 +689,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -714,7 +701,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -722,7 +709,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB15_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -732,15 +719,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB15_1;
 ; SM90-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_seq_cst_i8_global(
+define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -748,9 +735,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -761,7 +747,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -769,7 +755,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB16_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -782,12 +768,12 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_seq_cst_i8_shared(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -795,9 +781,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -808,15 +793,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB17_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -826,15 +811,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB17_1;
 ; SM90-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_monotonic_i8_generic(
+define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -842,9 +827,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -855,15 +839,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB18_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -873,14 +857,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB18_1;
 ; SM90-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -888,9 +873,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -901,7 +885,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -909,7 +893,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB19_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -919,14 +903,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB19_1;
 ; SM90-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_monotonic_i8_shared(
+define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -934,9 +919,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -947,7 +931,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -955,7 +939,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB20_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -965,14 +949,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB20_1;
 ; SM90-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -980,9 +965,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -993,15 +977,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB21_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1011,15 +995,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB21_1;
 ; SM90-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_acquire_i8_global(
+define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1027,9 +1011,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1040,15 +1023,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB22_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1058,15 +1041,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB22_1;
 ; SM90-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1074,9 +1057,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1087,7 +1069,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1095,7 +1077,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB23_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1105,15 +1087,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB23_1;
 ; SM90-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1121,8 +1103,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1134,7 +1116,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1142,7 +1124,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB24_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1155,12 +1137,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1168,9 +1150,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1181,15 +1163,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB25_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1199,15 +1181,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB25_1;
 ; SM90-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1215,9 +1197,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1228,15 +1210,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB26_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1246,15 +1228,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB26_1;
 ; SM90-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1262,9 +1244,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1275,7 +1257,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1283,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB27_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1293,15 +1275,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB27_1;
 ; SM90-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_global(
+define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1309,9 +1291,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1322,7 +1304,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1330,7 +1312,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB28_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1343,12 +1325,12 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1356,9 +1338,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1369,15 +1351,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB29_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1387,15 +1369,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB29_1;
 ; SM90-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_generic(
+define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1403,9 +1385,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1416,15 +1398,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB30_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1434,15 +1416,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB30_1;
 ; SM90-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_global(
+define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1450,9 +1432,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1463,7 +1445,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1471,7 +1453,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB31_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1481,15 +1463,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB31_1;
 ; SM90-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1497,9 +1479,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1510,7 +1492,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1518,7 +1500,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB32_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1531,12 +1513,12 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1544,9 +1526,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1557,15 +1539,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB33_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1575,15 +1557,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB33_1;
 ; SM90-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1591,9 +1573,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1604,15 +1586,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB34_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1622,15 +1604,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB34_1;
 ; SM90-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1638,9 +1620,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB35_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB35_1;
 ; SM90-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_monotonic_i8_generic(
+define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB36_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_monotonic_i8_global(
+define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB37_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB37_1;
 ; SM90-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_monotonic_i8_shared(
+define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB38_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB38_1;
 ; SM90-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1839,7 +1817,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1847,7 +1825,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB39_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1857,15 +1835,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB39_1;
 ; SM90-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_acquire_i8_global(
+define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB40_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1907,12 +1884,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_acquire_i8_shared(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB41_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB41_1;
 ; SM90-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cta;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i8_generic(
+define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB42_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1998,15 +1973,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB42_1;
 ; SM90-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.cluster;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i8_global(
+define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -2027,7 +2001,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -2035,7 +2009,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB43_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB43_1;
 ; SM90-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    fence.acquire.gpu;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
     ret i8 %new
 }
 
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i8_shared(
+define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
@@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
@@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB44_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -2095,3586 +2068,20641 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
     ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_monotonic_i16_generic(
+define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB45_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB45_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB45_1;
 ; SM90-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
+    ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_monotonic_i16_global(
+define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB46_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB46_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB46_1;
 ; SM90-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic
+    ret i8 %new
 }
 
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_monotonic_i16_shared(
+define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB47_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB47_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB47_1;
 ; SM90-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_acquire_i16_generic(
+define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB48_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB48_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB48_1;
 ; SM90-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_acquire_i16_global(
+define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB49_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB49_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB49_1;
 ; SM90-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+    ret i8 %new
 }
 
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_acquire_i16_shared(
+define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB50_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB50_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB50_1;
 ; SM90-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_seq_cst_i16_generic(
+define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB51_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB51_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB51_1;
 ; SM90-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_seq_cst_i16_global(
+define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB52_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB52_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB52_1;
 ; SM90-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
+    ret i8 %new
 }
 
-define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_seq_cst_i16_shared(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB53_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB53_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB53_1;
 ; SM90-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_monotonic_i16_generic(
+define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB54_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB54_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB54_1;
 ; SM90-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_monotonic_i16_global(
+define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB55_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB55_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB55_1;
 ; SM90-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_monotonic_i16_shared(
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB56_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB56_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
-; SM90-NEXT:    @%p2 bra $L__BB56_1;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB56_1;
 ; SM90-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_acquire_i16_generic(
+define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB57_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB57_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB57_1;
 ; SM90-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_acquire_i16_global(
+define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB58_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB58_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB58_1;
 ; SM90-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_acquire_i16_shared(
+define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB59_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB59_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB59_1;
 ; SM90-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB60_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB60_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB60_1;
 ; SM90-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB61_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB61_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB61_1;
 ; SM90-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB62_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB62_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB62_1;
 ; SM90-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB63_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB63_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB63_1;
 ; SM90-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_global(
+define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB64_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB64_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB64_1;
 ; SM90-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_shared(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB65_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB65_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB65_1;
 ; SM90-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_generic(
+define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB66_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB66_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB66_1;
 ; SM90-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_global(
+define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB67_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB67_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB67_1;
 ; SM90-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB68_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB68_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB68_1;
 ; SM90-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB69_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB69_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB69_1;
 ; SM90-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB70_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB70_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB70_1;
 ; SM90-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB71_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB71_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB71_1;
 ; SM90-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_generic(
+define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB72_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB72_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB72_1;
 ; SM90-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_global(
+define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB73_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB73_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB73_1;
 ; SM90-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_shared(
+define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB74_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB74_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB74_1;
 ; SM90-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_generic(
+define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB75_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB75_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB75_1;
 ; SM90-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_global(
+define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB76_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB76_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB76_1;
 ; SM90-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_shared(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB77_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB77_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB77_1;
 ; SM90-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_generic(
+define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB78_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB78_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB78_1;
 ; SM90-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_global(
+define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB79_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB79_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB79_1;
 ; SM90-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_shared(
+define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB80_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB80_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB80_1;
 ; SM90-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_generic(
+define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB81_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB81_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB81_1;
 ; SM90-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_global(
+define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB82_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB82_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB82_1;
 ; SM90-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_shared(
+define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB83_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB83_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB83_1;
 ; SM90-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_generic(
+define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB84_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB84_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB84_1;
 ; SM90-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_global(
+define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB85_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB85_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB85_1;
 ; SM90-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_shared(
+define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB86_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB86_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB86_1;
 ; SM90-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_generic(
+define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB87_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB87_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB87_1;
 ; SM90-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_global(
+define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB88_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB88_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB88_1;
 ; SM90-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
 }
 
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_shared(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<21>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM90-NEXT:    @%p1 bra $L__BB89_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB89_1 Depth=1
 ; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB89_1;
 ; SM90-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-    ret i16 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_generic(
+define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM90-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB90_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB90_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB90_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB90_1;
+; SM90-NEXT:  $L__BB90_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire
+    ret i8 %new
 }
 
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_global(
+define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM90-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB91_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB91_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB91_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB91_1;
+; SM90-NEXT:  $L__BB91_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB92_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB92_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB92_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB92_1;
+; SM90-NEXT:  $L__BB92_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB93_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB93_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB93_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB93_1;
+; SM90-NEXT:  $L__BB93_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB94_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB94_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB94_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB94_1;
+; SM90-NEXT:  $L__BB94_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB95_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB95_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB95_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB95_1;
+; SM90-NEXT:  $L__BB95_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB96_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB96_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB96_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB96_1;
+; SM90-NEXT:  $L__BB96_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB97_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB97_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB97_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB97_1;
+; SM90-NEXT:  $L__BB97_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB98_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB98_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB98_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB98_1;
+; SM90-NEXT:  $L__BB98_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB99_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB99_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB99_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB99_1;
+; SM90-NEXT:  $L__BB99_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB100_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB100_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB100_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB100_1;
+; SM90-NEXT:  $L__BB100_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB101_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB101_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB101_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB101_1;
+; SM90-NEXT:  $L__BB101_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB102_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB102_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB102_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB102_1;
+; SM90-NEXT:  $L__BB102_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB103_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB103_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB103_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB103_1;
+; SM90-NEXT:  $L__BB103_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB104_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB104_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB104_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB104_1;
+; SM90-NEXT:  $L__BB104_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB105_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB105_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB105_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB105_1;
+; SM90-NEXT:  $L__BB105_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on shared (addrspace 3) memory, syncscope("cluster"), success=release / failure=seq_cst.
+; SM90-LABEL: release_seq_cst_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB106_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB106_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB106_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB106_1;
+; SM90-NEXT:  $L__BB106_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ; checks above expect: fence.sc.cluster, a relaxed cluster-scope shared cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cluster
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on shared (addrspace 3) memory, syncscope("device") (printed as .gpu in the expected PTX), success=release / failure=seq_cst.
+; SM90-LABEL: release_seq_cst_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB107_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB107_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB107_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB107_1;
+; SM90-NEXT:  $L__BB107_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ; checks above expect: fence.sc.gpu, a relaxed gpu-scope shared cas.b32 loop on the 4-byte-aligned word, then fence.acquire.gpu
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, default syncscope("") (printed as .sys in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB108_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB108_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB108_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB108_1;
+; SM90-NEXT:  $L__BB108_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ; checks above expect: fence.release.sys, a relaxed sys-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, syncscope("block") (printed as .cta in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB109_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB109_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB109_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB109_1;
+; SM90-NEXT:  $L__BB109_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ; checks above expect: fence.release.cta, a relaxed cta-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cta
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, syncscope("cluster"), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB110_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB110_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB110_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB110_1;
+; SM90-NEXT:  $L__BB110_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ; checks above expect: fence.release.cluster, a relaxed cluster-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cluster
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, syncscope("device") (printed as .gpu in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB111_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB111_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB111_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB111_1;
+; SM90-NEXT:  $L__BB111_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ; checks above expect: fence.release.gpu, a relaxed gpu-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.gpu
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on global (addrspace 1) memory, default syncscope("") (printed as .sys in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB112_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB112_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB112_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB112_1;
+; SM90-NEXT:  $L__BB112_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ; checks above expect: fence.release.sys, a relaxed sys-scope global cas.b32 loop on the 4-byte-aligned word, then fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on global (addrspace 1) memory, syncscope("block") (printed as .cta in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB113_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB113_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB113_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB113_1;
+; SM90-NEXT:  $L__BB113_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ; checks above expect: fence.release.cta, a relaxed cta-scope global cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cta
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on global (addrspace 1) memory, syncscope("cluster"), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB114_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB114_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB114_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB114_1;
+; SM90-NEXT:  $L__BB114_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ; checks above expect: fence.release.cluster, a relaxed cluster-scope global cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cluster
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on global (addrspace 1) memory, syncscope("device") (printed as .gpu in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB115_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB115_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB115_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB115_1;
+; SM90-NEXT:  $L__BB115_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ; checks above expect: fence.release.gpu, a relaxed gpu-scope global cas.b32 loop on the 4-byte-aligned word, then fence.acquire.gpu
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on shared (addrspace 3) memory, default syncscope("") (printed as .sys in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB116_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB116_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB116_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB116_1;
+; SM90-NEXT:  $L__BB116_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ; checks above expect: fence.release.sys, a relaxed sys-scope shared cas.b32 loop on the 4-byte-aligned word, then fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on shared (addrspace 3) memory, syncscope("block") (printed as .cta in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB117_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB117_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB117_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB117_1;
+; SM90-NEXT:  $L__BB117_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ; checks above expect: fence.release.cta, a relaxed cta-scope shared cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cta
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on shared (addrspace 3) memory, syncscope("cluster"), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB118_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB118_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB118_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB118_1;
+; SM90-NEXT:  $L__BB118_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ; checks above expect: fence.release.cluster, a relaxed cluster-scope shared cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cluster
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg on shared (addrspace 3) memory, syncscope("device") (printed as .gpu in the expected PTX), success=acq_rel / failure=monotonic.
+; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB119_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB119_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB119_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB119_1;
+; SM90-NEXT:  $L__BB119_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ; checks above expect: fence.release.gpu, a relaxed gpu-scope shared cas.b32 loop on the 4-byte-aligned word, then fence.acquire.gpu
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, default syncscope("") (printed as .sys in the expected PTX), success=acq_rel / failure=acquire.
+; SM90-LABEL: acq_rel_acquire_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB120_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB120_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB120_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB120_1;
+; SM90-NEXT:  $L__BB120_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ; checks above expect: fence.release.sys, a relaxed sys-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, syncscope("block") (printed as .cta in the expected PTX), success=acq_rel / failure=acquire.
+; SM90-LABEL: acq_rel_acquire_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB121_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB121_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB121_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB121_1;
+; SM90-NEXT:  $L__BB121_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ; checks above expect: fence.release.cta, a relaxed cta-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cta
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer, syncscope("cluster"), success=acq_rel / failure=acquire.
+; SM90-LABEL: acq_rel_acquire_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB122_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB122_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB122_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB122_1;
+; SM90-NEXT:  $L__BB122_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ; checks above expect: fence.release.cluster, a relaxed cluster-scope cas.b32 loop on the 4-byte-aligned word, then fence.acquire.cluster
+    ret i8 %new ; returns the %new argument; %pairold is never read
+}
+
+define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB123_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB123_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB123_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB123_1;
+; SM90-NEXT:  $L__BB123_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB124_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB124_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB124_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB124_1;
+; SM90-NEXT:  $L__BB124_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB125_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB125_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB125_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB125_1;
+; SM90-NEXT:  $L__BB125_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB126_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB126_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB126_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB126_1;
+; SM90-NEXT:  $L__BB126_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB127_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB127_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB127_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB127_1;
+; SM90-NEXT:  $L__BB127_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB128_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB128_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB128_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB128_1;
+; SM90-NEXT:  $L__BB128_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB129_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB129_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB129_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB129_1;
+; SM90-NEXT:  $L__BB129_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB130_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB130_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB130_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB130_1;
+; SM90-NEXT:  $L__BB130_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB131_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB131_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB131_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB131_1;
+; SM90-NEXT:  $L__BB131_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB132_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB132_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB132_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB132_1;
+; SM90-NEXT:  $L__BB132_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB133_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB133_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB133_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB133_1;
+; SM90-NEXT:  $L__BB133_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB134_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB134_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB134_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB134_1;
+; SM90-NEXT:  $L__BB134_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB135_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB135_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB135_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB135_1;
+; SM90-NEXT:  $L__BB135_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB136_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB136_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB136_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB136_1;
+; SM90-NEXT:  $L__BB136_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB137_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB137_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB137_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB137_1;
+; SM90-NEXT:  $L__BB137_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB138_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB138_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB138_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB138_1;
+; SM90-NEXT:  $L__BB138_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB139_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB139_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB139_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB139_1;
+; SM90-NEXT:  $L__BB139_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB140_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB140_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB140_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB140_1;
+; SM90-NEXT:  $L__BB140_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB141_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB141_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB141_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB141_1;
+; SM90-NEXT:  $L__BB141_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB142_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB142_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB142_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB142_1;
+; SM90-NEXT:  $L__BB142_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB143_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB143_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB143_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB143_1;
+; SM90-NEXT:  $L__BB143_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB144_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB144_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB144_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB144_1;
+; SM90-NEXT:  $L__BB144_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB145_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB145_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB145_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB145_1;
+; SM90-NEXT:  $L__BB145_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB146_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB146_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB146_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB146_1;
+; SM90-NEXT:  $L__BB146_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB147_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB147_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB147_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB147_1;
+; SM90-NEXT:  $L__BB147_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB148_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB148_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB148_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB148_1;
+; SM90-NEXT:  $L__BB148_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB149_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB149_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB149_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB149_1;
+; SM90-NEXT:  $L__BB149_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB150_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB150_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB150_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB150_1;
+; SM90-NEXT:  $L__BB150_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB151_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB151_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB151_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB151_1;
+; SM90-NEXT:  $L__BB151_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB152_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB152_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB152_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB152_1;
+; SM90-NEXT:  $L__BB152_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB153_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB153_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB153_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB153_1;
+; SM90-NEXT:  $L__BB153_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB154_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB154_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB154_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB154_1;
+; SM90-NEXT:  $L__BB154_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB155_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB155_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB155_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB155_1;
+; SM90-NEXT:  $L__BB155_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB156_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB156_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB156_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB156_1;
+; SM90-NEXT:  $L__BB156_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB157_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB157_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB157_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB157_1;
+; SM90-NEXT:  $L__BB157_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB158_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB158_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB158_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB158_1;
+; SM90-NEXT:  $L__BB158_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB159_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB159_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB159_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB159_1;
+; SM90-NEXT:  $L__BB159_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB160_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB160_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB160_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB160_1;
+; SM90-NEXT:  $L__BB160_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB161_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB161_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB161_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB161_1;
+; SM90-NEXT:  $L__BB161_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB162_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB162_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB162_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB162_1;
+; SM90-NEXT:  $L__BB162_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB163_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB163_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB163_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB163_1;
+; SM90-NEXT:  $L__BB163_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB164_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB164_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB164_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB164_1;
+; SM90-NEXT:  $L__BB164_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB165_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB165_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB165_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB165_1;
+; SM90-NEXT:  $L__BB165_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB166_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB166_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB166_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB166_1;
+; SM90-NEXT:  $L__BB166_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB167_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB167_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB167_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB167_1;
+; SM90-NEXT:  $L__BB167_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB168_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB168_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB168_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB168_1;
+; SM90-NEXT:  $L__BB168_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB169_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB169_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB169_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB169_1;
+; SM90-NEXT:  $L__BB169_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB170_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB170_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB170_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB170_1;
+; SM90-NEXT:  $L__BB170_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB171_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB171_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB171_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB171_1;
+; SM90-NEXT:  $L__BB171_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB172_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB172_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB172_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB172_1;
+; SM90-NEXT:  $L__BB172_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB173_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB173_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB173_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB173_1;
+; SM90-NEXT:  $L__BB173_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB174_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB174_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB174_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB174_1;
+; SM90-NEXT:  $L__BB174_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB175_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB175_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB175_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB175_1;
+; SM90-NEXT:  $L__BB175_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB176_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB176_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB176_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB176_1;
+; SM90-NEXT:  $L__BB176_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB177_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB177_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB177_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB177_1;
+; SM90-NEXT:  $L__BB177_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB178_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB178_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB178_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB178_1;
+; SM90-NEXT:  $L__BB178_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB179_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB179_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB179_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB179_1;
+; SM90-NEXT:  $L__BB179_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+    ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB180_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB180_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB180_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB180_1;
+; SM90-NEXT:  $L__BB180_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB181_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB181_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB181_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB181_1;
+; SM90-NEXT:  $L__BB181_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB182_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB182_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB182_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB182_1;
+; SM90-NEXT:  $L__BB182_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB183_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB183_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB183_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB183_1;
+; SM90-NEXT:  $L__BB183_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB184_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB184_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB184_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB184_1;
+; SM90-NEXT:  $L__BB184_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB185_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB185_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB185_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB185_1;
+; SM90-NEXT:  $L__BB185_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer, syncscope("cluster"), orderings
+; `monotonic monotonic`. The expected PTX emulates the 16-bit exchange with a
+; masked retry loop around a 32-bit atom.relaxed.cluster.global.cas on the
+; 4-byte-aligned word containing %addr; no fence is expected.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB186_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB186_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB186_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB186_1;
+; SM90-NEXT:  $L__BB186_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer, syncscope("device"), orderings
+; `monotonic monotonic`. The expected PTX emulates the 16-bit exchange with a
+; masked retry loop around a 32-bit atom.relaxed.gpu.global.cas on the
+; 4-byte-aligned word containing %addr; no fence is expected.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB187_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB187_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB187_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB187_1;
+; SM90-NEXT:  $L__BB187_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, default syncscope(""), orderings
+; `monotonic monotonic`. The expected PTX emulates the 16-bit exchange with a
+; masked retry loop around a 32-bit atom.relaxed.sys.shared.cas on the
+; 4-byte-aligned word containing %addr; no fence is expected.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB188_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB188_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB188_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB188_1;
+; SM90-NEXT:  $L__BB188_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, syncscope("block"), orderings
+; `monotonic monotonic`. The expected PTX emulates the 16-bit exchange with a
+; masked retry loop around a 32-bit atom.relaxed.cta.shared.cas on the
+; 4-byte-aligned word containing %addr; no fence is expected.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB189_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB189_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB189_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB189_1;
+; SM90-NEXT:  $L__BB189_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, syncscope("cluster"), orderings
+; `monotonic monotonic`. The expected PTX emulates the 16-bit exchange with a
+; masked retry loop around a 32-bit atom.relaxed.cluster.shared.cas on the
+; 4-byte-aligned word containing %addr; no fence is expected.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB190_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB190_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB190_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB190_1;
+; SM90-NEXT:  $L__BB190_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, syncscope("device"), orderings
+; `monotonic monotonic`. The expected PTX emulates the 16-bit exchange with a
+; masked retry loop around a 32-bit atom.relaxed.gpu.shared.cas on the
+; 4-byte-aligned word containing %addr; no fence is expected.
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB191_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB191_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB191_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB191_1;
+; SM90-NEXT:  $L__BB191_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg through a generic pointer, default syncscope(""), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.sys.cas on the 4-byte-aligned word containing %addr, followed by
+; fence.acquire.sys at %partword.cmpxchg.end (reached from both loop exits).
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB192_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB192_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB192_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB192_1;
+; SM90-NEXT:  $L__BB192_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg through a generic pointer, syncscope("block"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.cta.cas on the 4-byte-aligned word containing %addr, followed by
+; fence.acquire.cta at %partword.cmpxchg.end (reached from both loop exits).
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB193_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB193_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB193_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB193_1;
+; SM90-NEXT:  $L__BB193_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg through a generic pointer, syncscope("cluster"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.cluster.cas on the 4-byte-aligned word containing %addr, followed
+; by fence.acquire.cluster at %partword.cmpxchg.end (reached from both loop
+; exits). The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB194_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB194_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB194_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB194_1;
+; SM90-NEXT:  $L__BB194_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg through a generic pointer, syncscope("device"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.gpu.cas on the 4-byte-aligned word containing %addr, followed by
+; fence.acquire.gpu at %partword.cmpxchg.end (reached from both loop exits).
+; The function returns %new; the cmpxchg result %pairold is unused.
+define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB195_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB195_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB195_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB195_1;
+; SM90-NEXT:  $L__BB195_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer, default syncscope(""), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.sys.global.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.sys at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB196_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB196_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB196_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB196_1;
+; SM90-NEXT:  $L__BB196_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer, syncscope("block"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.cta.global.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.cta at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB197_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB197_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB197_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB197_1;
+; SM90-NEXT:  $L__BB197_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer, syncscope("cluster"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.cluster.global.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.cluster at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB198_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB198_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB198_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB198_1;
+; SM90-NEXT:  $L__BB198_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) pointer, syncscope("device"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.gpu.global.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.gpu at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB199_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB199_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB199_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB199_1;
+; SM90-NEXT:  $L__BB199_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, default syncscope(""), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.sys.shared.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.sys at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB200_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB200_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB200_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB200_1;
+; SM90-NEXT:  $L__BB200_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, syncscope("block"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.cta.shared.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.cta at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB201_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB201_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB201_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB201_1;
+; SM90-NEXT:  $L__BB201_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, syncscope("cluster"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.cluster.shared.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.cluster at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB202_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB202_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB202_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB202_1;
+; SM90-NEXT:  $L__BB202_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) pointer, syncscope("device"), orderings
+; `monotonic acquire`. Expected PTX is a masked retry loop around a 32-bit
+; atom.relaxed.gpu.shared.cas on the 4-byte-aligned word containing %addr,
+; followed by fence.acquire.gpu at %partword.cmpxchg.end (reached from both
+; loop exits). The function returns %new; the result %pairold is unused.
+define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB203_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB203_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB203_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB203_1;
+; SM90-NEXT:  $L__BB203_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB204_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB204_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB204_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB204_1;
+; SM90-NEXT:  $L__BB204_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ; i16 CAS, generic pointer, default syncscope (sys in the checks); expects fence.sc.sys, a masked 32-bit atom.relaxed.sys.cas loop, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB205_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB205_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB205_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB205_1;
+; SM90-NEXT:  $L__BB205_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ; i16 CAS, generic pointer, "block" scope (cta in the checks); expects fence.sc.cta, a masked 32-bit atom.relaxed.cta.cas loop, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB206_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB206_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB206_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB206_1;
+; SM90-NEXT:  $L__BB206_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst ; i16 CAS, generic pointer, cluster scope; expects fence.sc.cluster, a masked 32-bit atom.relaxed.cluster.cas loop, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB207_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB207_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB207_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB207_1;
+; SM90-NEXT:  $L__BB207_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ; i16 CAS, generic pointer, "device" scope (gpu in the checks); expects fence.sc.gpu, a masked 32-bit atom.relaxed.gpu.cas loop, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB208_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB208_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB208_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB208_1;
+; SM90-NEXT:  $L__BB208_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ; i16 CAS, global address space, default syncscope (sys in the checks); expects fence.sc.sys, a masked 32-bit atom.relaxed.sys.global.cas loop, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB209_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB209_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB209_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB209_1;
+; SM90-NEXT:  $L__BB209_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ; i16 CAS, global address space, "block" scope (cta in the checks); expects fence.sc.cta, a masked 32-bit atom.relaxed.cta.global.cas loop, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB210_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB210_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB210_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB210_1;
+; SM90-NEXT:  $L__BB210_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst ; i16 CAS, global address space, cluster scope; expects fence.sc.cluster, a masked 32-bit atom.relaxed.cluster.global.cas loop, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB211_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB211_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB211_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB211_1;
+; SM90-NEXT:  $L__BB211_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ; i16 CAS, global address space, "device" scope (gpu in the checks); expects fence.sc.gpu, a masked 32-bit atom.relaxed.gpu.global.cas loop, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB212_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB212_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB212_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB212_1;
+; SM90-NEXT:  $L__BB212_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ; i16 CAS, shared address space, default syncscope (sys in the checks); expects fence.sc.sys, a masked 32-bit atom.relaxed.sys.shared.cas loop, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB213_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB213_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB213_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB213_1;
+; SM90-NEXT:  $L__BB213_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ; i16 CAS, shared address space, "block" scope (cta in the checks); expects fence.sc.cta, a masked 32-bit atom.relaxed.cta.shared.cas loop, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB214_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB214_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB214_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB214_1;
+; SM90-NEXT:  $L__BB214_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst ; i16 CAS, shared address space, cluster scope; expects fence.sc.cluster, a masked 32-bit atom.relaxed.cluster.shared.cas loop, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB215_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB215_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB215_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB215_1;
+; SM90-NEXT:  $L__BB215_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ; i16 CAS, shared address space, "device" scope (gpu in the checks); expects fence.sc.gpu, a masked 32-bit atom.relaxed.gpu.shared.cas loop, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB216_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB216_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB216_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB216_1;
+; SM90-NEXT:  $L__BB216_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ; i16 CAS, generic pointer, default syncscope (sys in the checks); expects a masked 32-bit atom.relaxed.sys.cas loop followed by fence.acquire.sys, with no leading fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB217_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB217_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB217_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB217_1;
+; SM90-NEXT:  $L__BB217_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ; i16 CAS, generic pointer, "block" scope (cta in the checks); expects a masked 32-bit atom.relaxed.cta.cas loop followed by fence.acquire.cta, with no leading fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB218_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB218_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB218_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB218_1;
+; SM90-NEXT:  $L__BB218_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic ; i16 CAS, generic pointer, cluster scope; expects a masked 32-bit atom.relaxed.cluster.cas loop followed by fence.acquire.cluster, with no leading fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB219_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB219_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB219_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB219_1;
+; SM90-NEXT:  $L__BB219_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ; i16 CAS, generic pointer, "device" scope (gpu in the checks); expects a masked 32-bit atom.relaxed.gpu.cas loop followed by fence.acquire.gpu, with no leading fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB220_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB220_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB220_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB220_1;
+; SM90-NEXT:  $L__BB220_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ; i16 cmpxchg on an addrspace(1) pointer with the default syncscope; the checks above expect an atom.relaxed.sys.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB221_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB221_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB221_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB221_1;
+; SM90-NEXT:  $L__BB221_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ; i16 cmpxchg on an addrspace(1) pointer; the checks above expect an atom.relaxed.cta.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB222_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB222_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB222_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB222_1;
+; SM90-NEXT:  $L__BB222_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic ; i16 cmpxchg on an addrspace(1) pointer; the checks above expect an atom.relaxed.cluster.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB223_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB223_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB223_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB223_1;
+; SM90-NEXT:  $L__BB223_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ; i16 cmpxchg on an addrspace(1) pointer; the checks above expect an atom.relaxed.gpu.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB224_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB224_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB224_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB224_1;
+; SM90-NEXT:  $L__BB224_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ; i16 cmpxchg on an addrspace(3) pointer with the default syncscope; the checks above expect an atom.relaxed.sys.shared.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB225_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB225_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB225_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB225_1;
+; SM90-NEXT:  $L__BB225_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ; i16 cmpxchg on an addrspace(3) pointer; the checks above expect an atom.relaxed.cta.shared.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB226_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB226_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB226_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB226_1;
+; SM90-NEXT:  $L__BB226_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic ; i16 cmpxchg on an addrspace(3) pointer; the checks above expect an atom.relaxed.cluster.shared.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB227_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB227_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB227_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB227_1;
+; SM90-NEXT:  $L__BB227_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ; i16 cmpxchg on an addrspace(3) pointer; the checks above expect an atom.relaxed.gpu.shared.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB228_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB228_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB228_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB228_1;
+; SM90-NEXT:  $L__BB228_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ; i16 cmpxchg on a generic pointer with the default syncscope; the checks above expect an atom.relaxed.sys.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB229_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB229_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB229_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB229_1;
+; SM90-NEXT:  $L__BB229_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ; i16 cmpxchg on a generic pointer; the checks above expect an atom.relaxed.cta.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB230_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB230_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB230_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB230_1;
+; SM90-NEXT:  $L__BB230_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire ; i16 cmpxchg on a generic pointer; the checks above expect an atom.relaxed.cluster.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB231_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB231_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB231_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB231_1;
+; SM90-NEXT:  $L__BB231_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ; i16 cmpxchg on a generic pointer; the checks above expect an atom.relaxed.gpu.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB232_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB232_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB232_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB232_1;
+; SM90-NEXT:  $L__BB232_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ; i16 cmpxchg on an addrspace(1) pointer with the default syncscope; the checks above expect an atom.relaxed.sys.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.sys
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB233_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB233_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB233_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB233_1;
+; SM90-NEXT:  $L__BB233_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ; i16 cmpxchg on an addrspace(1) pointer; the checks above expect an atom.relaxed.cta.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cta
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB234_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB234_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB234_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB234_1;
+; SM90-NEXT:  $L__BB234_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire ; i16 cmpxchg on an addrspace(1) pointer; the checks above expect an atom.relaxed.cluster.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.cluster
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB235_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB235_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB235_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB235_1;
+; SM90-NEXT:  $L__BB235_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ; i16 cmpxchg on an addrspace(1) pointer; the checks above expect an atom.relaxed.gpu.global.cas.b32 loop on the 4-byte-aligned containing word, then fence.acquire.gpu
+    ret i16 %new ; returns the %new argument; %pairold is not used
+}
+
+define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB236_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB236_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB236_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB236_1;
+; SM90-NEXT:  $L__BB236_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB237_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB237_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB237_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB237_1;
+; SM90-NEXT:  $L__BB237_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB238_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB238_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB238_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB238_1;
+; SM90-NEXT:  $L__BB238_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB239_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB239_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB239_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB239_1;
+; SM90-NEXT:  $L__BB239_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB240_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB240_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB240_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB240_1;
+; SM90-NEXT:  $L__BB240_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB241_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB241_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB241_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB241_1;
+; SM90-NEXT:  $L__BB241_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB242_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB242_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB242_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB242_1;
+; SM90-NEXT:  $L__BB242_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB243_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB243_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB243_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB243_1;
+; SM90-NEXT:  $L__BB243_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB244_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB244_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB244_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB244_1;
+; SM90-NEXT:  $L__BB244_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB245_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB245_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB245_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB245_1;
+; SM90-NEXT:  $L__BB245_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB246_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB246_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB246_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB246_1;
+; SM90-NEXT:  $L__BB246_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB247_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB247_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB247_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB247_1;
+; SM90-NEXT:  $L__BB247_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB248_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB248_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB248_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB248_1;
+; SM90-NEXT:  $L__BB248_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB249_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB249_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB249_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB249_1;
+; SM90-NEXT:  $L__BB249_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB250_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB250_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB250_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB250_1;
+; SM90-NEXT:  $L__BB250_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(3) pointer, syncscope("device"), success=acquire, failure=seq_cst.
+; SM90-LABEL: acquire_seq_cst_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB251_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB251_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB251_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB251_1;
+; SM90-NEXT:  $L__BB251_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ; checks above expect: fence.sc.gpu, a relaxed b32 shared CAS loop on the 4-byte-aligned containing word, then fence.acquire.gpu.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope(""), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB252_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB252_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB252_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB252_1;
+; SM90-NEXT:  $L__BB252_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic ; checks above expect: fence.release.sys before a relaxed b32 CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope("block"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB253_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB253_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB253_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB253_1;
+; SM90-NEXT:  $L__BB253_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ; checks above expect: fence.release.cta before a relaxed b32 CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope("cluster"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB254_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB254_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB254_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB254_1;
+; SM90-NEXT:  $L__BB254_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic ; checks above expect: fence.release.cluster before a relaxed b32 CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope("device"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB255_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB255_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB255_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB255_1;
+; SM90-NEXT:  $L__BB255_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ; checks above expect: fence.release.gpu before a relaxed b32 CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(1) pointer, syncscope(""), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB256_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB256_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB256_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB256_1;
+; SM90-NEXT:  $L__BB256_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ; checks above expect: fence.release.sys before a relaxed b32 global CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(1) pointer, syncscope("block"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB257_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB257_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB257_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB257_1;
+; SM90-NEXT:  $L__BB257_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ; checks above expect: fence.release.cta before a relaxed b32 global CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(1) pointer, syncscope("cluster"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB258_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB258_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB258_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB258_1;
+; SM90-NEXT:  $L__BB258_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic ; checks above expect: fence.release.cluster before a relaxed b32 global CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(1) pointer, syncscope("device"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB259_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB259_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB259_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB259_1;
+; SM90-NEXT:  $L__BB259_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ; checks above expect: fence.release.gpu before a relaxed b32 global CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(3) pointer, syncscope(""), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB260_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB260_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB260_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB260_1;
+; SM90-NEXT:  $L__BB260_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ; checks above expect: fence.release.sys before a relaxed b32 shared CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(3) pointer, syncscope("block"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB261_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB261_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB261_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB261_1;
+; SM90-NEXT:  $L__BB261_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ; checks above expect: fence.release.cta before a relaxed b32 shared CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(3) pointer, syncscope("cluster"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB262_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB262_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB262_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB262_1;
+; SM90-NEXT:  $L__BB262_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic ; checks above expect: fence.release.cluster before a relaxed b32 shared CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(3) pointer, syncscope("device"), success=release, failure=monotonic.
+; SM90-LABEL: release_monotonic_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB263_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB263_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB263_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB263_1;
+; SM90-NEXT:  $L__BB263_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ; checks above expect: fence.release.gpu before a relaxed b32 shared CAS loop on the 4-byte-aligned containing word; no trailing fence.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope(""), success=release, failure=acquire.
+; SM90-LABEL: release_acquire_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB264_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB264_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB264_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB264_1;
+; SM90-NEXT:  $L__BB264_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire ; checks above expect: fence.release.sys, a relaxed b32 CAS loop on the 4-byte-aligned containing word, then fence.acquire.sys.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope("block"), success=release, failure=acquire.
+; SM90-LABEL: release_acquire_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB265_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB265_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB265_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB265_1;
+; SM90-NEXT:  $L__BB265_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire ; checks above expect: fence.release.cta, a relaxed b32 CAS loop on the 4-byte-aligned containing word, then fence.acquire.cta.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope("cluster"), success=release, failure=acquire.
+; SM90-LABEL: release_acquire_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB266_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB266_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB266_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB266_1;
+; SM90-NEXT:  $L__BB266_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire ; checks above expect: fence.release.cluster, a relaxed b32 CAS loop on the 4-byte-aligned containing word, then fence.acquire.cluster.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on a plain ptr (no addrspace qualifier), syncscope("device"), success=release, failure=acquire.
+; SM90-LABEL: release_acquire_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB267_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB267_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB267_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB267_1;
+; SM90-NEXT:  $L__BB267_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire ; checks above expect: fence.release.gpu, a relaxed b32 CAS loop on the 4-byte-aligned containing word, then fence.acquire.gpu.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; i16 cmpxchg on an addrspace(1) pointer, syncscope(""), success=release, failure=acquire.
+; SM90-LABEL: release_acquire_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB268_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB268_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB268_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB268_1;
+; SM90-NEXT:  $L__BB268_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire ; checks above expect: fence.release.sys, a relaxed b32 global CAS loop on the 4-byte-aligned containing word, then fence.acquire.sys.
+    ret i16 %new ; NOTE(review): returns the %new argument; %pairold is not used after the cmpxchg.
+}
+
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB269_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB269_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB269_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB269_1;
+; SM90-NEXT:  $L__BB269_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB270_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB270_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB270_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB270_1;
+; SM90-NEXT:  $L__BB270_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB271_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB271_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB271_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB271_1;
+; SM90-NEXT:  $L__BB271_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB272_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB272_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB272_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB272_1;
+; SM90-NEXT:  $L__BB272_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB273_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB273_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB273_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB273_1;
+; SM90-NEXT:  $L__BB273_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB274_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB274_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB274_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB274_1;
+; SM90-NEXT:  $L__BB274_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB275_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB275_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB275_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB275_1;
+; SM90-NEXT:  $L__BB275_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB276_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB276_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB276_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB276_1;
+; SM90-NEXT:  $L__BB276_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB277_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB277_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB277_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB277_1;
+; SM90-NEXT:  $L__BB277_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB278_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB278_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB278_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB278_1;
+; SM90-NEXT:  $L__BB278_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB279_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB279_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB279_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB279_1;
+; SM90-NEXT:  $L__BB279_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB280_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB280_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB280_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB280_1;
+; SM90-NEXT:  $L__BB280_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB281_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB281_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB281_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB281_1;
+; SM90-NEXT:  $L__BB281_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB282_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB282_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB282_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB282_1;
+; SM90-NEXT:  $L__BB282_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB283_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB283_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB283_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB283_1;
+; SM90-NEXT:  $L__BB283_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("") and
+; success/failure orderings release/seq_cst. The checks below expect a leading
+; fence.sc.sys, a 32-bit atom.relaxed.sys.shared.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.sys.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB284_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB284_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB284_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB284_1;
+; SM90-NEXT:  $L__BB284_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("block") and
+; success/failure orderings release/seq_cst. The checks below expect a leading
+; fence.sc.cta, a 32-bit atom.relaxed.cta.shared.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.cta.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB285_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB285_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB285_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB285_1;
+; SM90-NEXT:  $L__BB285_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("cluster") and
+; success/failure orderings release/seq_cst. The checks below expect a leading
+; fence.sc.cluster, a 32-bit atom.relaxed.cluster.shared.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.cluster.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB286_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB286_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB286_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB286_1;
+; SM90-NEXT:  $L__BB286_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("device") and
+; success/failure orderings release/seq_cst. The checks below expect a leading
+; fence.sc.gpu, a 32-bit atom.relaxed.gpu.shared.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.gpu.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB287_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB287_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB287_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB287_1;
+; SM90-NEXT:  $L__BB287_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic (no addrspace) pointer with syncscope("") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.sys, a 32-bit atom.relaxed.sys.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.sys.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB288_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB288_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB288_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB288_1;
+; SM90-NEXT:  $L__BB288_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic (no addrspace) pointer with syncscope("block") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.cta, a 32-bit atom.relaxed.cta.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.cta.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB289_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB289_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB289_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB289_1;
+; SM90-NEXT:  $L__BB289_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic (no addrspace) pointer with syncscope("cluster") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.cluster, a 32-bit atom.relaxed.cluster.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.cluster.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB290_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB290_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB290_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB290_1;
+; SM90-NEXT:  $L__BB290_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic (no addrspace) pointer with syncscope("device") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.gpu, a 32-bit atom.relaxed.gpu.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.gpu.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB291_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB291_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB291_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB291_1;
+; SM90-NEXT:  $L__BB291_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) (global) pointer with syncscope("") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.sys, a 32-bit atom.relaxed.sys.global.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.sys.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB292_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB292_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB292_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB292_1;
+; SM90-NEXT:  $L__BB292_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) (global) pointer with syncscope("block") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.cta, a 32-bit atom.relaxed.cta.global.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.cta.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB293_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB293_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB293_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB293_1;
+; SM90-NEXT:  $L__BB293_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) (global) pointer with syncscope("cluster") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.cluster, a 32-bit atom.relaxed.cluster.global.cas retry loop on
+; the enclosing 4-byte-aligned word, and a trailing fence.acquire.cluster.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB294_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB294_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB294_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB294_1;
+; SM90-NEXT:  $L__BB294_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(1) (global) pointer with syncscope("device") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.gpu, a 32-bit atom.relaxed.gpu.global.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.gpu.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB295_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB295_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB295_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB295_1;
+; SM90-NEXT:  $L__BB295_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.sys, a 32-bit atom.relaxed.sys.shared.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.sys.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB296_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB296_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB296_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB296_1;
+; SM90-NEXT:  $L__BB296_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("block") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.cta, a 32-bit atom.relaxed.cta.shared.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.cta.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB297_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB297_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB297_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB297_1;
+; SM90-NEXT:  $L__BB297_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("cluster") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.cluster, a 32-bit atom.relaxed.cluster.shared.cas retry loop on
+; the enclosing 4-byte-aligned word, and a trailing fence.acquire.cluster.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB298_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB298_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB298_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB298_1;
+; SM90-NEXT:  $L__BB298_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on an addrspace(3) (shared) pointer with syncscope("device") and
+; success/failure orderings acq_rel/monotonic. The checks below expect a leading
+; fence.release.gpu, a 32-bit atom.relaxed.gpu.shared.cas retry loop on the
+; enclosing 4-byte-aligned word, and a trailing fence.acquire.gpu.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB299_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB299_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB299_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB299_1;
+; SM90-NEXT:  $L__BB299_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+    ret i16 %new
+}
+
+; i16 cmpxchg on a generic (no addrspace) pointer with syncscope("") and
+; success/failure orderings acq_rel/acquire. The checks below expect a leading
+; fence.release.sys, a 32-bit atom.relaxed.sys.cas retry loop on the enclosing
+; 4-byte-aligned word, and a trailing fence.acquire.sys.
+; %pairold is never read; the function returns %new unchanged.
+define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB300_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB300_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB300_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB300_1;
+; SM90-NEXT:  $L__BB300_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB301_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB301_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB301_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB301_1;
+; SM90-NEXT:  $L__BB301_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB302_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB302_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB302_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB302_1;
+; SM90-NEXT:  $L__BB302_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB303_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB303_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB303_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB303_1;
+; SM90-NEXT:  $L__BB303_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB304_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB304_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB304_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB304_1;
+; SM90-NEXT:  $L__BB304_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB305_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB305_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB305_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB305_1;
+; SM90-NEXT:  $L__BB305_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB306_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB306_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB306_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB306_1;
+; SM90-NEXT:  $L__BB306_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB307_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB307_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB307_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB307_1;
+; SM90-NEXT:  $L__BB307_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB308_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB308_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB308_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB308_1;
+; SM90-NEXT:  $L__BB308_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB309_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB309_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB309_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB309_1;
+; SM90-NEXT:  $L__BB309_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB310_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB310_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB310_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB310_1;
+; SM90-NEXT:  $L__BB310_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB311_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB311_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB311_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB311_1;
+; SM90-NEXT:  $L__BB311_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB312_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB312_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB312_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB312_1;
+; SM90-NEXT:  $L__BB312_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB313_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB313_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB313_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB313_1;
+; SM90-NEXT:  $L__BB313_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB314_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB314_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB314_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB314_1;
+; SM90-NEXT:  $L__BB314_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB315_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB315_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB315_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB315_1;
+; SM90-NEXT:  $L__BB315_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB316_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB316_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB316_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB316_1;
+; SM90-NEXT:  $L__BB316_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB317_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB317_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB317_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB317_1;
+; SM90-NEXT:  $L__BB317_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB318_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB318_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB318_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB318_1;
+; SM90-NEXT:  $L__BB318_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB319_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB319_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB319_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB319_1;
+; SM90-NEXT:  $L__BB319_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB320_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB320_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB320_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB320_1;
+; SM90-NEXT:  $L__BB320_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB321_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB321_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB321_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB321_1;
+; SM90-NEXT:  $L__BB321_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB322_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB322_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB322_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB322_1;
+; SM90-NEXT:  $L__BB322_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB323_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB323_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB323_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB323_1;
+; SM90-NEXT:  $L__BB323_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB324_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB324_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB324_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB324_1;
+; SM90-NEXT:  $L__BB324_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB325_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB325_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB325_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB325_1;
+; SM90-NEXT:  $L__BB325_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB326_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB326_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB326_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB326_1;
+; SM90-NEXT:  $L__BB326_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB327_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB327_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB327_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB327_1;
+; SM90-NEXT:  $L__BB327_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB328_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB328_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB328_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB328_1;
+; SM90-NEXT:  $L__BB328_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB329_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB329_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB329_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB329_1;
+; SM90-NEXT:  $L__BB329_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB330_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB330_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB330_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB330_1;
+; SM90-NEXT:  $L__BB330_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB331_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB331_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB331_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB331_1;
+; SM90-NEXT:  $L__BB331_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB332_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB332_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB332_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB332_1;
+; SM90-NEXT:  $L__BB332_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB333_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB333_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB333_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB333_1;
+; SM90-NEXT:  $L__BB333_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB334_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB334_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB334_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB334_1;
+; SM90-NEXT:  $L__BB334_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB335_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB335_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB335_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB335_1;
+; SM90-NEXT:  $L__BB335_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB336_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB336_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB336_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB336_1;
+; SM90-NEXT:  $L__BB336_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB337_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB337_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB337_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB337_1;
+; SM90-NEXT:  $L__BB337_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB338_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB338_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB338_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB338_1;
+; SM90-NEXT:  $L__BB338_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB339_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB339_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB339_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB339_1;
+; SM90-NEXT:  $L__BB339_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB340_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB340_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB340_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB340_1;
+; SM90-NEXT:  $L__BB340_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB341_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB341_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB341_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB341_1;
+; SM90-NEXT:  $L__BB341_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB342_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB342_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB342_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB342_1;
+; SM90-NEXT:  $L__BB342_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB343_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB343_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB343_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB343_1;
+; SM90-NEXT:  $L__BB343_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB344_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB344_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB344_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB344_1;
+; SM90-NEXT:  $L__BB344_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB345_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB345_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB345_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB345_1;
+; SM90-NEXT:  $L__BB345_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB346_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB346_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB346_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB346_1;
+; SM90-NEXT:  $L__BB346_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB347_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB347_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB347_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB347_1;
+; SM90-NEXT:  $L__BB347_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB348_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB348_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB348_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB348_1;
+; SM90-NEXT:  $L__BB348_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB349_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB349_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB349_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB349_1;
+; SM90-NEXT:  $L__BB349_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB350_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB350_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB350_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB350_1;
+; SM90-NEXT:  $L__BB350_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB351_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB351_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB351_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB351_1;
+; SM90-NEXT:  $L__BB351_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB352_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB352_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB352_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB352_1;
+; SM90-NEXT:  $L__BB352_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB353_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB353_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB353_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB353_1;
+; SM90-NEXT:  $L__BB353_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB354_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB354_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB354_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB354_1;
+; SM90-NEXT:  $L__BB354_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB355_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB355_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB355_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB355_1;
+; SM90-NEXT:  $L__BB355_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB356_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB356_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB356_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB356_1;
+; SM90-NEXT:  $L__BB356_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB357_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB357_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB357_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB357_1;
+; SM90-NEXT:  $L__BB357_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB358_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB358_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB358_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB358_1;
+; SM90-NEXT:  $L__BB358_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB359_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB359_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB359_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB359_1;
+; SM90-NEXT:  $L__BB359_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+    ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2];
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2];
+; SM90-NEXT:    atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+    ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2];
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_shared(
+define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_generic(
+define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_global(
+define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2];
+; SM90-NEXT:    atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_shared(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_generic(
+define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_global(
+define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
 }
 
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_shared(
+define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_cta(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_cluster(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_generic(
+define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_global(
+define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_shared(
+define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_generic(
+define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_global(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_shared(
+define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_gpu(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_sys(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_generic(
+define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_global(
+define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst
+    ret i64 %new
 }
 
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_shared(
+define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
-; SM90-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
-; SM90-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
-; SM90-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
-; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_global(
+define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
-; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_shared(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
-; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_generic(
+define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_global(
+define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_acquire_i32_generic(
+define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_acquire_i32_global(
+define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_acquire_i32_shared(
+define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i32_generic(
+define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i32_global(
+define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i32_shared(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_monotonic_i32_generic(
+define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_monotonic_i32_global(
+define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_monotonic_i32_shared(
+define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_acquire_i32_generic(
+define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_acquire_i32_global(
+define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_acquire_i32_shared(
+define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_gpu(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
-    ret i32 %new
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i32_generic(
+define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i32_global(
+define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_cta(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
+    ret i64 %new
 }
 
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i32_shared(
+define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_cluster(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-    ret i32 %new
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst
+    ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_monotonic_i64_generic(
+define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
-; SM90-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_monotonic_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM90-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_monotonic_i64_shared(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM90-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_acquire_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_acquire_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_acquire_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_seq_cst_i64_global(
+define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_seq_cst_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_acquire_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_acquire_i64_global(
+define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_acquire_i64_shared(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_seq_cst_i64_generic(
+define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_seq_cst_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
-; SM90-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
-; SM90-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM90-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
-; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
-; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
-; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_seq_cst_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_seq_cst_i64_global(
+define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
-; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_monotonic_i64_global(
+define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
-; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_monotonic_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_monotonic_i64_shared(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_acquire_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_acquire_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_acquire_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT:    atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT:    atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT:    atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst
     ret i64 %new
 }
 
-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
-; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT:    atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
     ret i64 %new
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index 9eeff9d7c2b75..9d72c3f44d3a1 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB1_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB2_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB3_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r17, %r20, %r3;
 ; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB4_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT:    @%p1 bra $L__BB5_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT:    @%p1 bra $L__BB6_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT:    @%p1 bra $L__BB7_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT:    @%p1 bra $L__BB8_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT:    @%p1 bra $L__BB9_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
@@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
 ; SM70-NEXT:    ld.param.b32 %r1, [relaxed_sys_i32_param_1];
 ; SM70-NEXT:    ld.param.b32 %r2, [relaxed_sys_i32_param_2];
-; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: relaxed_sys_i32(
@@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
 ; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
 ; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acq_rel_sys_i32(
@@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [acquire_sys_i32_param_0];
 ; SM70-NEXT:    ld.param.b32 %r1, [acquire_sys_i32_param_1];
 ; SM70-NEXT:    ld.param.b32 %r2, [acquire_sys_i32_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acquire_sys_i32(
@@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [release_sys_i32_param_0];
 ; SM70-NEXT:    ld.param.b32 %r1, [release_sys_i32_param_1];
 ; SM70-NEXT:    ld.param.b32 %r2, [release_sys_i32_param_2];
-; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: release_sys_i32(
@@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
 ; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: seq_cst_sys_i32(
@@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
 ; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
 ; SM70-NEXT:    ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
-; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: relaxed_sys_i64(
@@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [acquire_sys_i64_param_0];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i64_param_1];
 ; SM70-NEXT:    ld.param.b64 %rd3, [acquire_sys_i64_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acquire_sys_i64(
@@ -1607,7 +1607,7 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
 ; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acq_rel_sys_i64(
@@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    ld.param.b64 %rd1, [release_sys_i64_param_0];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i64_param_1];
 ; SM70-NEXT:    ld.param.b64 %rd3, [release_sys_i64_param_2];
-; SM70-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: release_sys_i64(
@@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
 ; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: seq_cst_sys_i64(
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
index ae7450015ecd2..277704bd9d5a5 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.py
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -5,8 +5,8 @@
 from itertools import product
 
 cmpxchg_func = Template(
-    """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
-    %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
+    """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+    %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure
     ret i$size %new
 }
 """
@@ -38,9 +38,12 @@
     for sm, ptx in TESTS:
         with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
             print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
-            for size, success, failure, addrspace in product(
-                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES
+            for size, success, failure, addrspace, llvm_scope in product(
+                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES
             ):
+                # the cluster syncscope is supported from SM90 onwards
+                if sm != 90 and llvm_scope == "cluster":
+                    continue
                 if addrspace == 0:
                     addrspace_cast = ""
                 else:
@@ -52,6 +55,8 @@
                         size=size,
                         addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
                         addrspace_cast=addrspace_cast,
+                        llvm_scope=llvm_scope,
+                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
                     ),
                     file=fp,
                 )
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
index cea3ac37c1964..45202ee010328 100644
--- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
+++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
@@ -172,30 +172,30 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.release.sys.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0;
 ; CHECK-NEXT:    and.b64 %rd1, %rd2, -4;
 ; CHECK-NEXT:    cvt.u32.u64 %r33, %rd2;
 ; CHECK-NEXT:    and.b32 %r34, %r33, 3;
@@ -210,7 +210,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop33
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or.b32 %r39, %r48, %r3;
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48;
 ; CHECK-NEXT:    setp.eq.s32 %p1, %r6, %r39;
 ; CHECK-NEXT:    @%p1 bra $L__BB4_3;
 ; CHECK-NEXT:  // %bb.2: // %partword.cmpxchg.failure32
@@ -225,7 +225,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_4: // %partword.cmpxchg.loop23
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or.b32 %r41, %r49, %r3;
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49;
 ; CHECK-NEXT:    setp.eq.s32 %p3, %r10, %r41;
 ; CHECK-NEXT:    @%p3 bra $L__BB4_6;
 ; CHECK-NEXT:  // %bb.5: // %partword.cmpxchg.failure22
@@ -242,7 +242,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_7: // %partword.cmpxchg.loop13
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or.b32 %r43, %r50, %r3;
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50;
 ; CHECK-NEXT:    setp.eq.s32 %p5, %r14, %r43;
 ; CHECK-NEXT:    @%p5 bra $L__BB4_9;
 ; CHECK-NEXT:  // %bb.8: // %partword.cmpxchg.failure12
@@ -258,7 +258,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_10: // %partword.cmpxchg.loop3
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or.b32 %r45, %r51, %r3;
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51;
 ; CHECK-NEXT:    setp.eq.s32 %p7, %r18, %r45;
 ; CHECK-NEXT:    @%p7 bra $L__BB4_12;
 ; CHECK-NEXT:  // %bb.11: // %partword.cmpxchg.failure2
@@ -275,7 +275,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_13: // %partword.cmpxchg.loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or.b32 %r47, %r52, %r3;
-; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52;
 ; CHECK-NEXT:    setp.eq.s32 %p9, %r22, %r47;
 ; CHECK-NEXT:    @%p9 bra $L__BB4_15;
 ; CHECK-NEXT:  // %bb.14: // %partword.cmpxchg.failure



More information about the llvm-commits mailing list