[llvm-branch-commits] [llvm] 4a2e374 - Revert "[NVPTX] Weak cmpxchg unittests for NVPTX (#176078)"

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Jan 26 12:52:32 PST 2026


Author: Vladimir Vereschaka
Date: 2026-01-26T12:52:28-08:00
New Revision: 4a2e374ec05869267d33981bcfda09250f535ddf

URL: https://github.com/llvm/llvm-project/commit/4a2e374ec05869267d33981bcfda09250f535ddf
DIFF: https://github.com/llvm/llvm-project/commit/4a2e374ec05869267d33981bcfda09250f535ddf.diff

LOG: Revert "[NVPTX] Weak cmpxchg unittests for NVPTX (#176078)"

This reverts commit e936715b8c5e624b5d6cc1e2e50a32c394d03209.

Added: 
    

Modified: 
    llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
    llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
    llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
    llvm/test/CodeGen/NVPTX/cmpxchg.py

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 393c1ceae6916..d895c715ab3ce 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -1,3878 +1,2104 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
+; RUN: %if ptxas-sm_60 && ptxas-isa-5.0 %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
 
-define i8 @weak_monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_monotonic_monotonic_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i8_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_monotonic_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r7, [weak_monotonic_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_monotonic_acquire_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i8_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_monotonic_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r7, [weak_monotonic_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_monotonic_seq_cst_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_monotonic_seq_cst_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_monotonic_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acquire_monotonic_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i8_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acquire_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acquire_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acquire_acquire_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i8_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acquire_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acquire_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acquire_seq_cst_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acquire_seq_cst_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acquire_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_release_monotonic_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_release_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_release_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_release_acquire_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_release_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_release_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_release_seq_cst_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_release_seq_cst_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_release_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acq_rel_monotonic_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acq_rel_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acq_rel_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acq_rel_seq_cst_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acq_rel_seq_cst_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acq_rel_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_seq_cst_monotonic_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_seq_cst_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_seq_cst_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_seq_cst_acquire_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_seq_cst_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_seq_cst_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_seq_cst_seq_cst_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i8_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_seq_cst_seq_cst_i8_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_seq_cst_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i16 @weak_monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_monotonic_monotonic_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i16_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_monotonic_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r7, [weak_monotonic_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_monotonic_acquire_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i16_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_monotonic_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r7, [weak_monotonic_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_monotonic_seq_cst_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_monotonic_seq_cst_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_monotonic_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_acquire_monotonic_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i16_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_acquire_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r7, [weak_acquire_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_acquire_acquire_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i16_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_acquire_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r7, [weak_acquire_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_acquire_seq_cst_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_acquire_seq_cst_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_acquire_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_release_monotonic_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_release_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_release_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_release_acquire_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_release_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_release_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_release_seq_cst_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_release_seq_cst_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_release_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_acq_rel_monotonic_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_acq_rel_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_acq_rel_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_acq_rel_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_acq_rel_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_acq_rel_seq_cst_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_acq_rel_seq_cst_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_acq_rel_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_seq_cst_monotonic_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_seq_cst_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_seq_cst_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_seq_cst_acquire_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_seq_cst_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_seq_cst_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: weak_seq_cst_seq_cst_i16_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i16_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r6, [weak_seq_cst_seq_cst_i16_global_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [weak_seq_cst_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 65535;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i32 @weak_monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_monotonic_monotonic_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_monotonic_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_monotonic_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_monotonic_monotonic_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_monotonic_acquire_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_monotonic_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_monotonic_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_monotonic_acquire_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_monotonic_seq_cst_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_monotonic_seq_cst_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_monotonic_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_monotonic_seq_cst_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acquire_monotonic_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acquire_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acquire_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acquire_monotonic_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acquire_acquire_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acquire_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acquire_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acquire_acquire_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acquire_seq_cst_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acquire_seq_cst_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acquire_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acquire_seq_cst_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_release_monotonic_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_release_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_release_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_release_monotonic_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_release_acquire_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_release_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_release_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_release_acquire_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_release_seq_cst_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_release_seq_cst_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_release_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_release_seq_cst_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_monotonic_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_monotonic_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_seq_cst_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_seq_cst_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_seq_cst_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_seq_cst_monotonic_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_seq_cst_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_seq_cst_monotonic_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_seq_cst_acquire_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_seq_cst_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_seq_cst_acquire_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_seq_cst_seq_cst_i32_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_seq_cst_i32_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [weak_seq_cst_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_seq_cst_seq_cst_i32_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i64 @weak_monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_monotonic_monotonic_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_monotonic_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_monotonic_monotonic_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_monotonic_acquire_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_monotonic_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_monotonic_acquire_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_monotonic_seq_cst_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_monotonic_seq_cst_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_monotonic_seq_cst_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_acquire_monotonic_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acquire_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_acquire_monotonic_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_acquire_acquire_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acquire_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_acquire_acquire_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_acquire_seq_cst_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acquire_seq_cst_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_acquire_seq_cst_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_release_monotonic_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_release_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_release_monotonic_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_release_acquire_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_release_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_release_acquire_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_release_seq_cst_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_release_seq_cst_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_release_seq_cst_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_acq_rel_monotonic_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_monotonic_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_acquire_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_acq_rel_seq_cst_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_seq_cst_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_seq_cst_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_seq_cst_monotonic_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_monotonic_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_seq_cst_acquire_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_acquire_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: weak_seq_cst_seq_cst_i64_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_seq_cst_i64_global_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_seq_cst_i64_global_cta_param_2];
-; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i8_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_global_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.sys.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i32_global_sys(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_sys_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_sys_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_sys_param_2];
-; SM60-NEXT:    atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i32_global_gpu(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_gpu_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_gpu_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_gpu_param_2];
-; SM60-NEXT:    atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i8_generic_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<17>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_generic_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_generic_cta_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_generic_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM60-NEXT:    and.b32 %r9, %r8, 3;
-; SM60-NEXT:    shl.b32 %r1, %r9, 3;
-; SM60-NEXT:    mov.b32 %r10, 255;
-; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i8_shared_cta(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
 ; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_shared_cta_param_0];
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
 ; SM60-NEXT:    and.b32 %r9, %r8, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r9, 3;
 ; SM60-NEXT:    mov.b32 %r10, 255;
 ; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM60-NEXT:    not.b32 %r12, %r11;
-; SM60-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM60-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM60-NEXT:    ld.shared.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r4, %r13, %r12;
-; SM60-NEXT:    or.b32 %r14, %r4, %r2;
-; SM60-NEXT:    or.b32 %r15, %r4, %r3;
-; SM60-NEXT:    atom.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i32_generic_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_generic_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_generic_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_generic_cta_param_2];
-; SM60-NEXT:    atom.cta.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: weak_acq_rel_acquire_i32_shared_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_shared_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_shared_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_shared_cta_param_2];
-; SM60-NEXT:    atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM60-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i8 @strong_monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_monotonic_monotonic_i8_global_cta(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i8_global_cta_param_0];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_monotonic_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r8, [strong_monotonic_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB68_3;
+; SM60-NEXT:    @%p1 bra $L__BB0_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB68_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB68_1;
-; SM60-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB0_1;
+; SM60-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
+    ret i8 %new
 }
 
-define i8 @strong_monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_monotonic_acquire_i8_global_cta(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_monotonic_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r8, [strong_monotonic_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB69_3;
+; SM60-NEXT:    @%p1 bra $L__BB1_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB69_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB1_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB69_1;
-; SM60-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB1_1;
+; SM60-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
+    ret i8 %new
 }
 
-define i8 @strong_monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_monotonic_seq_cst_i8_global_cta(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_monotonic_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_monotonic_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB70_3;
+; SM60-NEXT:    @%p1 bra $L__BB2_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB70_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB2_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB70_1;
-; SM60-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB2_1;
+; SM60-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acquire_monotonic_i8_global_cta(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acquire_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acquire_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB71_3;
+; SM60-NEXT:    @%p1 bra $L__BB3_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB71_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB3_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB71_1;
-; SM60-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB3_1;
+; SM60-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
+    ret i8 %new
 }
 
-define i8 @strong_acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acquire_acquire_i8_global_cta(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acquire_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acquire_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB72_3;
+; SM60-NEXT:    @%p1 bra $L__BB4_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB72_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB4_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB72_1;
-; SM60-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB4_1;
+; SM60-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+    ret i8 %new
 }
 
-define i8 @strong_acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acquire_seq_cst_i8_global_cta(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acquire_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acquire_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB73_3;
+; SM60-NEXT:    @%p1 bra $L__BB5_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB73_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB5_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB73_1;
-; SM60-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB5_1;
+; SM60-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_release_monotonic_i8_global_cta(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_release_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_release_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB74_3;
+; SM60-NEXT:    @%p1 bra $L__BB6_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB74_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB6_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB74_1;
-; SM60-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB6_1;
+; SM60-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i8 @strong_release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_release_acquire_i8_global_cta(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_release_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_release_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB75_3;
+; SM60-NEXT:    @%p1 bra $L__BB7_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB75_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB7_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB75_1;
-; SM60-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB7_1;
+; SM60-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i8 @strong_release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_release_seq_cst_i8_global_cta(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_release_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_release_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB76_3;
+; SM60-NEXT:    @%p1 bra $L__BB8_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB76_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB8_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB76_1;
-; SM60-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB8_1;
+; SM60-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acq_rel_monotonic_i8_global_cta(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acq_rel_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acq_rel_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB77_3;
+; SM60-NEXT:    @%p1 bra $L__BB9_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB77_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB9_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB77_1;
-; SM60-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB9_1;
+; SM60-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i8_global_cta(
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB78_3;
+; SM60-NEXT:    @%p1 bra $L__BB10_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB78_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB10_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB78_1;
-; SM60-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB10_1;
+; SM60-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acq_rel_seq_cst_i8_global_cta(
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acq_rel_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acq_rel_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB79_3;
+; SM60-NEXT:    @%p1 bra $L__BB11_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB79_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB11_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB79_1;
-; SM60-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB11_1;
+; SM60-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_seq_cst_monotonic_i8_global_cta(
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_seq_cst_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_seq_cst_monotonic_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB80_3;
+; SM60-NEXT:    @%p1 bra $L__BB12_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB80_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB12_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB80_1;
-; SM60-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB12_1;
+; SM60-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_seq_cst_acquire_i8_global_cta(
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_seq_cst_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_seq_cst_acquire_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB81_3;
+; SM60-NEXT:    @%p1 bra $L__BB13_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB81_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB13_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB81_1;
-; SM60-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB13_1;
+; SM60-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_seq_cst_seq_cst_i8_global_cta(
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_seq_cst_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_seq_cst_seq_cst_i8_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB82_3;
+; SM60-NEXT:    @%p1 bra $L__BB14_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB82_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB14_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB82_1;
-; SM60-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB14_1;
+; SM60-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
 }
 
-define i16 @strong_monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_monotonic_monotonic_i16_global_cta(
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_monotonic_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r8, [strong_monotonic_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB83_3;
+; SM60-NEXT:    @%p1 bra $L__BB15_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB83_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB15_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB83_1;
-; SM60-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB15_1;
+; SM60-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
 }
 
-define i16 @strong_monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_monotonic_acquire_i16_global_cta(
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_monotonic_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r8, [strong_monotonic_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB84_3;
+; SM60-NEXT:    @%p1 bra $L__BB16_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB84_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB16_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB84_1;
-; SM60-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB16_1;
+; SM60-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
 }
 
-define i16 @strong_monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_monotonic_seq_cst_i16_global_cta(
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_monotonic_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_monotonic_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB85_3;
+; SM60-NEXT:    @%p1 bra $L__BB17_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB85_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB17_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB85_1;
-; SM60-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB17_1;
+; SM60-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_acquire_monotonic_i16_global_cta(
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_acquire_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r8, [strong_acquire_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB86_3;
+; SM60-NEXT:    @%p1 bra $L__BB18_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB86_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB18_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB86_1;
-; SM60-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB18_1;
+; SM60-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+    ret i16 %new
 }
 
-define i16 @strong_acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_acquire_acquire_i16_global_cta(
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_acquire_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    ld.param.b16 %r8, [strong_acquire_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB87_3;
+; SM60-NEXT:    @%p1 bra $L__BB19_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB87_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB19_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB87_1;
-; SM60-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB19_1;
+; SM60-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
 }
 
-define i16 @strong_acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_acquire_seq_cst_i16_global_cta(
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_acquire_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_acquire_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB88_3;
+; SM60-NEXT:    @%p1 bra $L__BB20_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB88_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB20_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB88_1;
-; SM60-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB20_1;
+; SM60-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_release_monotonic_i16_global_cta(
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_release_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_release_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB89_3;
+; SM60-NEXT:    @%p1 bra $L__BB21_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB21_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB89_1;
-; SM60-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB21_1;
+; SM60-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
 }
 
-define i16 @strong_release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_release_acquire_i16_global_cta(
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_release_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_release_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB90_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB90_3;
+; SM60-NEXT:    @%p1 bra $L__BB22_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB90_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB22_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB90_1;
-; SM60-NEXT:  $L__BB90_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB22_1;
+; SM60-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
 }
 
-define i16 @strong_release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_release_seq_cst_i16_global_cta(
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_release_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_release_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB91_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB91_3;
+; SM60-NEXT:    @%p1 bra $L__BB23_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB91_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB23_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB91_1;
-; SM60-NEXT:  $L__BB91_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB23_1;
+; SM60-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_acq_rel_monotonic_i16_global_cta(
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_acq_rel_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_acq_rel_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB92_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB92_3;
+; SM60-NEXT:    @%p1 bra $L__BB24_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB92_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB24_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB92_1;
-; SM60-NEXT:  $L__BB92_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB24_1;
+; SM60-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i16_global_cta(
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_acq_rel_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_acq_rel_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB93_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB93_3;
+; SM60-NEXT:    @%p1 bra $L__BB25_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB93_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB25_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB93_1;
-; SM60-NEXT:  $L__BB93_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB25_1;
+; SM60-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_acq_rel_seq_cst_i16_global_cta(
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_acq_rel_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_acq_rel_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB94_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB94_3;
+; SM60-NEXT:    @%p1 bra $L__BB26_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB94_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB26_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB94_1;
-; SM60-NEXT:  $L__BB94_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB26_1;
+; SM60-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_seq_cst_monotonic_i16_global_cta(
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_seq_cst_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_seq_cst_monotonic_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB95_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB95_3;
+; SM60-NEXT:    @%p1 bra $L__BB27_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB95_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB27_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB95_1;
-; SM60-NEXT:  $L__BB95_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB27_1;
+; SM60-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_seq_cst_acquire_i16_global_cta(
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_seq_cst_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_seq_cst_acquire_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB96_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB96_3;
+; SM60-NEXT:    @%p1 bra $L__BB28_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB96_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB28_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB96_1;
-; SM60-NEXT:  $L__BB96_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB28_1;
+; SM60-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: strong_seq_cst_seq_cst_i16_global_cta(
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r7, [strong_seq_cst_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r8, [strong_seq_cst_seq_cst_i16_global_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 65535;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB97_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB97_3;
+; SM60-NEXT:    @%p1 bra $L__BB29_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB97_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB29_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB97_1;
-; SM60-NEXT:  $L__BB97_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB29_1;
+; SM60-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
 }
 
-define i32 @strong_monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_monotonic_monotonic_i32_global_cta(
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_monotonic_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_monotonic_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_monotonic_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
 }
 
-define i32 @strong_monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_monotonic_acquire_i32_global_cta(
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_monotonic_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_monotonic_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_monotonic_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
 }
 
-define i32 @strong_monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_monotonic_seq_cst_i32_global_cta(
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_monotonic_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_monotonic_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_monotonic_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acquire_monotonic_i32_global_cta(
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acquire_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acquire_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acquire_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
 }
 
-define i32 @strong_acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acquire_acquire_i32_global_cta(
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acquire_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acquire_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acquire_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
 }
 
-define i32 @strong_acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acquire_seq_cst_i32_global_cta(
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acquire_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acquire_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acquire_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_release_monotonic_i32_global_cta(
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_release_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_release_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_release_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
 }
 
-define i32 @strong_release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_release_acquire_i32_global_cta(
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_release_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_release_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_release_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
 }
 
-define i32 @strong_release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_release_seq_cst_i32_global_cta(
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_release_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_release_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_release_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_monotonic_i32_global_cta(
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_monotonic_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i32_global_cta(
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_seq_cst_i32_global_cta(
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_seq_cst_monotonic_i32_global_cta(
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_monotonic_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_seq_cst_monotonic_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_seq_cst_monotonic_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_seq_cst_acquire_i32_global_cta(
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_acquire_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_seq_cst_acquire_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_seq_cst_acquire_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_seq_cst_seq_cst_i32_global_cta(
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b32 %r1, [strong_seq_cst_seq_cst_i32_global_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_seq_cst_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
 }
 
-define i64 @strong_monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_monotonic_monotonic_i64_global_cta(
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_monotonic_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_monotonic_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
 }
 
-define i64 @strong_monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_monotonic_acquire_i64_global_cta(
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_monotonic_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_monotonic_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
 }
 
-define i64 @strong_monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_monotonic_seq_cst_i64_global_cta(
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_monotonic_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_monotonic_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_acquire_monotonic_i64_global_cta(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acquire_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_acquire_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
 }
 
-define i64 @strong_acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_acquire_acquire_i64_global_cta(
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acquire_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_acquire_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
 }
 
-define i64 @strong_acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_acquire_seq_cst_i64_global_cta(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acquire_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_acquire_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_release_monotonic_i64_global_cta(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_release_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_release_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i64 @strong_release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_release_acquire_i64_global_cta(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_release_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_release_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i64 @strong_release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_release_seq_cst_i64_global_cta(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_release_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_release_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_acq_rel_monotonic_i64_global_cta(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_monotonic_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i64_global_cta(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i64_global_cta_param_0];
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_acq_rel_seq_cst_i64_global_cta(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_seq_cst_monotonic_i64_global_cta(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_monotonic_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_monotonic_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_seq_cst_acquire_i64_global_cta(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_acquire_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_acquire_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: strong_seq_cst_seq_cst_i64_global_cta(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i64_global_cta_param_1];
-; SM60-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
 ; SM60-NEXT:    atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
+    ret i64 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i8_global(
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_global_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB128_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.sys.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB128_3;
+; SM60-NEXT:    @%p1 bra $L__BB60_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB128_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB60_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB128_1;
-; SM60-NEXT:  $L__BB128_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB60_1;
+; SM60-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i32_global(
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
 ; SM60-NEXT:    atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i32_global_sys(
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_sys(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_sys_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_sys_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_sys_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
 ; SM60-NEXT:    atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i32_global_gpu(
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_gpu(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_gpu_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_gpu_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_gpu_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
 ; SM60-NEXT:    atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i8_generic_cta(
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_generic_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB132_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB132_3;
+; SM60-NEXT:    @%p1 bra $L__BB64_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB132_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB64_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB132_1;
-; SM60-NEXT:  $L__BB132_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB64_1;
+; SM60-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i8_shared_cta(
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_shared_cta_param_2];
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r13, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r13, %r2;
-; SM60-NEXT:  $L__BB133_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
+; SM60-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r14, %r17, %r3;
-; SM60-NEXT:    or.b32 %r15, %r17, %r4;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM60-NEXT:    atom.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM60-NEXT:    @%p1 bra $L__BB133_3;
+; SM60-NEXT:    @%p1 bra $L__BB65_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB133_1 Depth=1
+; SM60-NEXT:    // in Loop: Header=BB65_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
-; SM60-NEXT:    @%p2 bra $L__BB133_1;
-; SM60-NEXT:  $L__BB133_3: // %partword.cmpxchg.end
-; SM60-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
+; SM60-NEXT:    @%p2 bra $L__BB65_1;
+; SM60-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i32_generic_cta(
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_generic_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_generic_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_generic_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
 ; SM60-NEXT:    atom.cta.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: strong_acq_rel_acquire_i32_shared_cta(
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_cta(
 ; SM60:       {
 ; SM60-NEXT:    .reg .b32 %r<4>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_shared_cta_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_shared_cta_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_shared_cta_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
 ; SM60-NEXT:    atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 

diff  --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
index be3a81dea77c6..76220ee3a3996 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -1,3878 +1,2104 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
 
-define i8 @weak_monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_monotonic_monotonic_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i8_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_monotonic_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r7, [weak_monotonic_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_monotonic_acquire_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i8_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_monotonic_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r7, [weak_monotonic_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_monotonic_seq_cst_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_monotonic_seq_cst_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_monotonic_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acquire_monotonic_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i8_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acquire_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acquire_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acquire_acquire_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i8_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acquire_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acquire_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acquire_seq_cst_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acquire_seq_cst_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acquire_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_release_monotonic_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i8_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_release_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_release_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_release_acquire_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i8_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_release_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_release_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_release_seq_cst_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_release_seq_cst_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_release_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acq_rel_monotonic_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i8_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acq_rel_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acq_rel_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acq_rel_seq_cst_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acq_rel_seq_cst_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acq_rel_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_seq_cst_monotonic_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_seq_cst_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_seq_cst_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_seq_cst_acquire_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_seq_cst_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_seq_cst_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_seq_cst_seq_cst_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i8_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_seq_cst_seq_cst_i8_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_seq_cst_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i16 @weak_monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_monotonic_monotonic_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i16_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_monotonic_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r7, [weak_monotonic_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_monotonic_acquire_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i16_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_monotonic_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r7, [weak_monotonic_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_monotonic_seq_cst_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_monotonic_seq_cst_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_monotonic_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_acquire_monotonic_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i16_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_acquire_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r7, [weak_acquire_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_acquire_acquire_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i16_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_acquire_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r7, [weak_acquire_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_acquire_seq_cst_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_acquire_seq_cst_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_acquire_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_release_monotonic_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i16_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_release_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_release_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_release_acquire_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i16_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_release_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_release_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_release_seq_cst_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_release_seq_cst_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_release_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_acq_rel_monotonic_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i16_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_acq_rel_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_acq_rel_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i16_global_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_acq_rel_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_acq_rel_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_acq_rel_seq_cst_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_acq_rel_seq_cst_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_acq_rel_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_seq_cst_monotonic_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_seq_cst_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_seq_cst_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_seq_cst_acquire_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_seq_cst_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_seq_cst_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: weak_seq_cst_seq_cst_i16_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i16_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r6, [weak_seq_cst_seq_cst_i16_global_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [weak_seq_cst_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 65535;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i32 @weak_monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_monotonic_monotonic_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_monotonic_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_monotonic_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_monotonic_monotonic_i32_global_cta_param_2];
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_monotonic_acquire_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_monotonic_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_monotonic_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_monotonic_acquire_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_monotonic_seq_cst_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_monotonic_seq_cst_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_monotonic_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_monotonic_seq_cst_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acquire_monotonic_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acquire_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acquire_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acquire_monotonic_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acquire_acquire_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acquire_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acquire_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acquire_acquire_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acquire_seq_cst_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acquire_seq_cst_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acquire_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acquire_seq_cst_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_release_monotonic_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_release_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_release_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_release_monotonic_i32_global_cta_param_2];
-; SM70-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_release_acquire_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_release_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_release_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_release_acquire_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_release_seq_cst_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_release_seq_cst_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_release_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_release_seq_cst_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_monotonic_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_monotonic_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_seq_cst_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_seq_cst_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_seq_cst_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_seq_cst_monotonic_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_seq_cst_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_seq_cst_monotonic_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_seq_cst_acquire_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_seq_cst_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_seq_cst_acquire_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_seq_cst_seq_cst_i32_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_seq_cst_i32_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [weak_seq_cst_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_seq_cst_seq_cst_i32_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i64 @weak_monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_monotonic_monotonic_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_monotonic_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_monotonic_monotonic_i64_global_cta_param_2];
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_monotonic_acquire_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_monotonic_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_monotonic_acquire_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_monotonic_seq_cst_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_monotonic_seq_cst_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_monotonic_seq_cst_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_acquire_monotonic_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acquire_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_acquire_monotonic_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_acquire_acquire_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acquire_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_acquire_acquire_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_acquire_seq_cst_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acquire_seq_cst_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_acquire_seq_cst_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_release_monotonic_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_release_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_release_monotonic_i64_global_cta_param_2];
-; SM70-NEXT:    atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_release_acquire_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_release_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_release_acquire_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_release_seq_cst_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_release_seq_cst_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_release_seq_cst_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_acq_rel_monotonic_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_monotonic_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_acquire_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_acq_rel_seq_cst_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_seq_cst_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_seq_cst_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_seq_cst_monotonic_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_monotonic_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_seq_cst_acquire_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_acquire_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: weak_seq_cst_seq_cst_i64_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_seq_cst_i64_global_cta_param_0];
-; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_seq_cst_i64_global_cta_param_2];
-; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i8_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_global_param_0];
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_global_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_global_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i32_global_sys(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_sys_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_sys_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_sys_param_2];
-; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i32_global_gpu(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_gpu_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_gpu_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_gpu_param_2];
-; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i8_generic_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<17>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_generic_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_generic_cta_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_generic_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM70-NEXT:    and.b32 %r9, %r8, 3;
-; SM70-NEXT:    shl.b32 %r1, %r9, 3;
-; SM70-NEXT:    mov.b32 %r10, 255;
-; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i8_shared_cta(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
 ; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_shared_cta_param_0];
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
 ; SM70-NEXT:    and.b32 %r9, %r8, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r9, 3;
 ; SM70-NEXT:    mov.b32 %r10, 255;
 ; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM70-NEXT:    not.b32 %r12, %r11;
-; SM70-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM70-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM70-NEXT:    ld.shared.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r4, %r13, %r12;
-; SM70-NEXT:    or.b32 %r14, %r4, %r2;
-; SM70-NEXT:    or.b32 %r15, %r4, %r3;
-; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i32_generic_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_generic_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_generic_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_generic_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: weak_acq_rel_acquire_i32_shared_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_shared_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_shared_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_shared_cta_param_2];
-; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM70-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i8 @strong_monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_monotonic_monotonic_i8_global_cta(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i8_global_cta_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_monotonic_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r8, [strong_monotonic_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB68_3;
+; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB68_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB68_1;
-; SM70-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB0_1;
+; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
+    ret i8 %new
 }
 
-define i8 @strong_monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_monotonic_acquire_i8_global_cta(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_monotonic_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r8, [strong_monotonic_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB69_3;
+; SM70-NEXT:    @%p1 bra $L__BB1_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB69_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB69_1;
-; SM70-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB1_1;
+; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
+    ret i8 %new
 }
 
-define i8 @strong_monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_monotonic_seq_cst_i8_global_cta(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_monotonic_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_monotonic_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB70_3;
+; SM70-NEXT:    @%p1 bra $L__BB2_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB70_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB2_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB70_1;
-; SM70-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB2_1;
+; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acquire_monotonic_i8_global_cta(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acquire_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acquire_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB71_3;
+; SM70-NEXT:    @%p1 bra $L__BB3_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB71_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB3_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB71_1;
-; SM70-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB3_1;
+; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
+    ret i8 %new
 }
 
-define i8 @strong_acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acquire_acquire_i8_global_cta(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acquire_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acquire_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB72_3;
+; SM70-NEXT:    @%p1 bra $L__BB4_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB72_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB4_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB72_1;
-; SM70-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB4_1;
+; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+    ret i8 %new
 }
 
-define i8 @strong_acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acquire_seq_cst_i8_global_cta(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acquire_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acquire_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB73_3;
+; SM70-NEXT:    @%p1 bra $L__BB5_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB73_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB5_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB73_1;
-; SM70-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB5_1;
+; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_release_monotonic_i8_global_cta(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_release_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_release_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB74_3;
+; SM70-NEXT:    @%p1 bra $L__BB6_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB74_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB6_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB74_1;
-; SM70-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB6_1;
+; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i8 @strong_release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_release_acquire_i8_global_cta(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_release_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_release_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB75_3;
+; SM70-NEXT:    @%p1 bra $L__BB7_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB75_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB7_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB75_1;
-; SM70-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB7_1;
+; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i8 @strong_release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_release_seq_cst_i8_global_cta(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_release_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_release_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB76_3;
+; SM70-NEXT:    @%p1 bra $L__BB8_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB76_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB8_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB76_1;
-; SM70-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB8_1;
+; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acq_rel_monotonic_i8_global_cta(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acq_rel_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acq_rel_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB77_3;
+; SM70-NEXT:    @%p1 bra $L__BB9_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB77_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB9_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB77_1;
-; SM70-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB9_1;
+; SM70-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i8_global_cta(
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB78_3;
+; SM70-NEXT:    @%p1 bra $L__BB10_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB78_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB10_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB78_1;
-; SM70-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB10_1;
+; SM70-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acq_rel_seq_cst_i8_global_cta(
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acq_rel_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acq_rel_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB79_3;
+; SM70-NEXT:    @%p1 bra $L__BB11_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB79_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB11_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB79_1;
-; SM70-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB11_1;
+; SM70-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_seq_cst_monotonic_i8_global_cta(
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_seq_cst_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_seq_cst_monotonic_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB80_3;
+; SM70-NEXT:    @%p1 bra $L__BB12_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB80_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB12_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB80_1;
-; SM70-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB12_1;
+; SM70-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_seq_cst_acquire_i8_global_cta(
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_seq_cst_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_seq_cst_acquire_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB81_3;
+; SM70-NEXT:    @%p1 bra $L__BB13_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB81_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB13_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB81_1;
-; SM70-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB13_1;
+; SM70-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_seq_cst_seq_cst_i8_global_cta(
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_seq_cst_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_seq_cst_seq_cst_i8_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB82_3;
+; SM70-NEXT:    @%p1 bra $L__BB14_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB82_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB14_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB82_1;
-; SM70-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB14_1;
+; SM70-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
 }
 
-define i16 @strong_monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_monotonic_monotonic_i16_global_cta(
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_monotonic_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r8, [strong_monotonic_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB83_3;
+; SM70-NEXT:    @%p1 bra $L__BB15_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB83_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB15_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB83_1;
-; SM70-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB15_1;
+; SM70-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
 }
 
-define i16 @strong_monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_monotonic_acquire_i16_global_cta(
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_monotonic_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r8, [strong_monotonic_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB84_3;
+; SM70-NEXT:    @%p1 bra $L__BB16_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB84_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB16_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB84_1;
-; SM70-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB16_1;
+; SM70-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
 }
 
-define i16 @strong_monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_monotonic_seq_cst_i16_global_cta(
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_monotonic_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_monotonic_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB85_3;
+; SM70-NEXT:    @%p1 bra $L__BB17_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB85_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB17_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB85_1;
-; SM70-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB17_1;
+; SM70-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_acquire_monotonic_i16_global_cta(
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_acquire_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r8, [strong_acquire_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB86_3;
+; SM70-NEXT:    @%p1 bra $L__BB18_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB86_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB18_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB86_1;
-; SM70-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB18_1;
+; SM70-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+    ret i16 %new
 }
 
-define i16 @strong_acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_acquire_acquire_i16_global_cta(
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_acquire_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    ld.param.b16 %r8, [strong_acquire_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB87_3;
+; SM70-NEXT:    @%p1 bra $L__BB19_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB87_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB19_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB87_1;
-; SM70-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB19_1;
+; SM70-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
 }
 
-define i16 @strong_acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_acquire_seq_cst_i16_global_cta(
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_acquire_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_acquire_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB88_3;
+; SM70-NEXT:    @%p1 bra $L__BB20_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB88_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB20_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB88_1;
-; SM70-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB20_1;
+; SM70-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_release_monotonic_i16_global_cta(
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_release_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_release_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB89_3;
+; SM70-NEXT:    @%p1 bra $L__BB21_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB21_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB89_1;
-; SM70-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB21_1;
+; SM70-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
 }
 
-define i16 @strong_release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_release_acquire_i16_global_cta(
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_release_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_release_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB90_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB90_3;
+; SM70-NEXT:    @%p1 bra $L__BB22_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB90_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB22_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB90_1;
-; SM70-NEXT:  $L__BB90_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB22_1;
+; SM70-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
 }
 
-define i16 @strong_release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_release_seq_cst_i16_global_cta(
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_release_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_release_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB91_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB91_3;
+; SM70-NEXT:    @%p1 bra $L__BB23_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB91_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB23_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB91_1;
-; SM70-NEXT:  $L__BB91_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB23_1;
+; SM70-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_acq_rel_monotonic_i16_global_cta(
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_acq_rel_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_acq_rel_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB92_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB92_3;
+; SM70-NEXT:    @%p1 bra $L__BB24_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB92_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB24_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB92_1;
-; SM70-NEXT:  $L__BB92_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB24_1;
+; SM70-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i16_global_cta(
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_acq_rel_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_acq_rel_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB93_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB93_3;
+; SM70-NEXT:    @%p1 bra $L__BB25_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB93_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB25_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB93_1;
-; SM70-NEXT:  $L__BB93_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB25_1;
+; SM70-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_acq_rel_seq_cst_i16_global_cta(
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_acq_rel_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_acq_rel_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB94_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB94_3;
+; SM70-NEXT:    @%p1 bra $L__BB26_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB94_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB26_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB94_1;
-; SM70-NEXT:  $L__BB94_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB26_1;
+; SM70-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_seq_cst_monotonic_i16_global_cta(
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_seq_cst_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_seq_cst_monotonic_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB95_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB95_3;
+; SM70-NEXT:    @%p1 bra $L__BB27_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB95_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB27_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB95_1;
-; SM70-NEXT:  $L__BB95_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB27_1;
+; SM70-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_seq_cst_acquire_i16_global_cta(
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_seq_cst_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_seq_cst_acquire_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB96_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB96_3;
+; SM70-NEXT:    @%p1 bra $L__BB28_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB96_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB28_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB96_1;
-; SM70-NEXT:  $L__BB96_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB28_1;
+; SM70-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: strong_seq_cst_seq_cst_i16_global_cta(
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r7, [strong_seq_cst_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r8, [strong_seq_cst_seq_cst_i16_global_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 65535;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB97_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB97_3;
+; SM70-NEXT:    @%p1 bra $L__BB29_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB97_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB29_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB97_1;
-; SM70-NEXT:  $L__BB97_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB29_1;
+; SM70-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
 }
 
-define i32 @strong_monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_monotonic_monotonic_i32_global_cta(
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_monotonic_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_monotonic_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_monotonic_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
 }
 
-define i32 @strong_monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_monotonic_acquire_i32_global_cta(
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_monotonic_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_monotonic_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_monotonic_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
 }
 
-define i32 @strong_monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_monotonic_seq_cst_i32_global_cta(
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_monotonic_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_monotonic_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_monotonic_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acquire_monotonic_i32_global_cta(
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acquire_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acquire_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acquire_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
 }
 
-define i32 @strong_acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acquire_acquire_i32_global_cta(
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acquire_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acquire_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acquire_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
 }
 
-define i32 @strong_acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acquire_seq_cst_i32_global_cta(
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acquire_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acquire_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acquire_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_release_monotonic_i32_global_cta(
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_release_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_release_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_release_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
 }
 
-define i32 @strong_release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_release_acquire_i32_global_cta(
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_release_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_release_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_release_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
 }
 
-define i32 @strong_release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_release_seq_cst_i32_global_cta(
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_release_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_release_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_release_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_monotonic_i32_global_cta(
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_monotonic_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i32_global_cta(
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_seq_cst_i32_global_cta(
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_seq_cst_monotonic_i32_global_cta(
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_seq_cst_monotonic_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_seq_cst_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_seq_cst_acquire_i32_global_cta(
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_seq_cst_acquire_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_seq_cst_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_seq_cst_seq_cst_i32_global_cta(
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b32 %r1, [strong_seq_cst_seq_cst_i32_global_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_seq_cst_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
 }
 
-define i64 @strong_monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_monotonic_monotonic_i64_global_cta(
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_monotonic_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_monotonic_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
 }
 
-define i64 @strong_monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_monotonic_acquire_i64_global_cta(
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_monotonic_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_monotonic_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
 }
 
-define i64 @strong_monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_monotonic_seq_cst_i64_global_cta(
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_monotonic_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_monotonic_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_acquire_monotonic_i64_global_cta(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acquire_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_acquire_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
 }
 
-define i64 @strong_acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_acquire_acquire_i64_global_cta(
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acquire_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_acquire_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
 }
 
-define i64 @strong_acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_acquire_seq_cst_i64_global_cta(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acquire_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_acquire_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_release_monotonic_i64_global_cta(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_release_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_release_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i64 @strong_release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_release_acquire_i64_global_cta(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_release_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_release_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i64 @strong_release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_release_seq_cst_i64_global_cta(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_release_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_release_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_acq_rel_monotonic_i64_global_cta(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_monotonic_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i64_global_cta(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i64_global_cta_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_acq_rel_seq_cst_i64_global_cta(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_seq_cst_monotonic_i64_global_cta(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_monotonic_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_monotonic_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_seq_cst_acquire_i64_global_cta(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_acquire_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_acquire_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: strong_seq_cst_seq_cst_i64_global_cta(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i64_global_cta_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
 ; SM70-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
+    ret i64 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i8_global(
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_global_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB128_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB128_3;
+; SM70-NEXT:    @%p1 bra $L__BB60_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB128_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB60_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB128_1;
-; SM70-NEXT:  $L__BB128_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB60_1;
+; SM70-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i32_global(
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i32_global_sys(
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_sys(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_sys_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_sys_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_sys_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
 ; SM70-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i32_global_gpu(
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_gpu(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_gpu_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_gpu_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_gpu_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
 ; SM70-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i8_generic_cta(
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_generic_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB132_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB132_3;
+; SM70-NEXT:    @%p1 bra $L__BB64_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB132_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB64_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB132_1;
-; SM70-NEXT:  $L__BB132_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB64_1;
+; SM70-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i8_shared_cta(
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_shared_cta_param_2];
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r13, [%rd1];
-; SM70-NEXT:    and.b32 %r17, %r13, %r2;
-; SM70-NEXT:  $L__BB133_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
+; SM70-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r14, %r17, %r3;
-; SM70-NEXT:    or.b32 %r15, %r17, %r4;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM70-NEXT:    @%p1 bra $L__BB133_3;
+; SM70-NEXT:    @%p1 bra $L__BB65_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB133_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB65_1 Depth=1
 ; SM70-NEXT:    and.b32 %r6, %r5, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM70-NEXT:    mov.b32 %r17, %r6;
-; SM70-NEXT:    @%p2 bra $L__BB133_1;
-; SM70-NEXT:  $L__BB133_3: // %partword.cmpxchg.end
-; SM70-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
+; SM70-NEXT:    @%p2 bra $L__BB65_1;
+; SM70-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i32_generic_cta(
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_generic_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_generic_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_generic_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: strong_acq_rel_acquire_i32_shared_cta(
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_shared_cta_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_shared_cta_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_shared_cta_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
 ; SM70-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 

diff  --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index f206697bee006..4cdedb2065e23 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -1,3914 +1,2121 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
 
-define i8 @weak_monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_monotonic_monotonic_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i8_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_monotonic_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r7, [weak_monotonic_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_monotonic_acquire_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i8_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_monotonic_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r7, [weak_monotonic_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_monotonic_seq_cst_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_monotonic_seq_cst_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_monotonic_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acquire_monotonic_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i8_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acquire_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acquire_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acquire_acquire_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i8_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acquire_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acquire_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acquire_seq_cst_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acquire_seq_cst_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acquire_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_release_monotonic_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i8_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_release_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_release_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_release_acquire_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i8_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_release_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_release_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_release_seq_cst_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_release_seq_cst_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_release_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acq_rel_monotonic_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i8_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acq_rel_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acq_rel_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acq_rel_seq_cst_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acq_rel_seq_cst_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acq_rel_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_seq_cst_monotonic_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_seq_cst_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_seq_cst_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_seq_cst_acquire_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_seq_cst_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_seq_cst_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_seq_cst_seq_cst_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i8_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_seq_cst_seq_cst_i8_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_seq_cst_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i16 @weak_monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_monotonic_monotonic_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i16_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_monotonic_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r7, [weak_monotonic_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_monotonic_acquire_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i16_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_monotonic_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r7, [weak_monotonic_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_monotonic_seq_cst_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_monotonic_seq_cst_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_monotonic_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_acquire_monotonic_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i16_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_acquire_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r7, [weak_acquire_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_acquire_acquire_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i16_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_acquire_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r7, [weak_acquire_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_acquire_seq_cst_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_acquire_seq_cst_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_acquire_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_release_monotonic_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i16_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_release_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_release_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_release_acquire_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i16_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_release_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_release_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_release_seq_cst_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_release_seq_cst_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_release_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_acq_rel_monotonic_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i16_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_acq_rel_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_acq_rel_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i16_global_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_acq_rel_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_acq_rel_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_acq_rel_seq_cst_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_acq_rel_seq_cst_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_acq_rel_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_seq_cst_monotonic_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_seq_cst_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_seq_cst_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_seq_cst_acquire_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_seq_cst_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_seq_cst_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i16 @weak_seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: weak_seq_cst_seq_cst_i16_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i16_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r6, [weak_seq_cst_seq_cst_i16_global_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [weak_seq_cst_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 65535;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
-}
-
-define i32 @weak_monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_monotonic_monotonic_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_monotonic_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_monotonic_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_monotonic_monotonic_i32_global_cta_param_2];
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_monotonic_acquire_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_monotonic_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_monotonic_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_monotonic_acquire_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_monotonic_seq_cst_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_monotonic_seq_cst_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_monotonic_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_monotonic_seq_cst_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acquire_monotonic_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acquire_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acquire_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acquire_monotonic_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acquire_acquire_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acquire_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acquire_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acquire_acquire_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acquire_seq_cst_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acquire_seq_cst_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acquire_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acquire_seq_cst_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_release_monotonic_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_release_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_release_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_release_monotonic_i32_global_cta_param_2];
-; SM90-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_release_acquire_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_release_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_release_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_release_acquire_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_release_seq_cst_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_release_seq_cst_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_release_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_release_seq_cst_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_monotonic_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_monotonic_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_seq_cst_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_seq_cst_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_seq_cst_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_seq_cst_monotonic_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_seq_cst_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_seq_cst_monotonic_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_seq_cst_acquire_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_seq_cst_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_seq_cst_acquire_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_seq_cst_seq_cst_i32_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_seq_cst_i32_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [weak_seq_cst_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_seq_cst_seq_cst_i32_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i64 @weak_monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_monotonic_monotonic_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_monotonic_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_monotonic_monotonic_i64_global_cta_param_2];
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_monotonic_acquire_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_monotonic_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_monotonic_acquire_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_monotonic_seq_cst_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_monotonic_seq_cst_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_monotonic_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_monotonic_seq_cst_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_acquire_monotonic_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acquire_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_acquire_monotonic_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_acquire_acquire_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acquire_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_acquire_acquire_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_acquire_seq_cst_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acquire_seq_cst_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acquire_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_acquire_seq_cst_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_release_monotonic_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_release_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_release_monotonic_i64_global_cta_param_2];
-; SM90-NEXT:    atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_release_acquire_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_release_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_release_acquire_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_release_seq_cst_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_release_seq_cst_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_release_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_release_seq_cst_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_acq_rel_monotonic_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_monotonic_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_acquire_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_acq_rel_seq_cst_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_seq_cst_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_acq_rel_seq_cst_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_seq_cst_monotonic_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_monotonic_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_seq_cst_acquire_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_acquire_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i64 @weak_seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: weak_seq_cst_seq_cst_i64_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<5>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_seq_cst_seq_cst_i64_global_cta_param_0];
-; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_seq_cst_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [weak_seq_cst_seq_cst_i64_global_cta_param_2];
-; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i8_global(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_global_param_0];
-; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_global_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_global_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_global(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_param_2];
-; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_global_sys(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_sys_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_sys_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_sys_param_2];
-; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_global_cluster(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_cluster_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_cluster_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_cluster_param_2];
-; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_global_gpu(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_global_gpu_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_global_gpu_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_global_gpu_param_2];
-; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i8_generic_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<17>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_generic_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_generic_cta_param_1];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_generic_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
-; SM90-NEXT:    and.b32 %r9, %r8, 3;
-; SM90-NEXT:    shl.b32 %r1, %r9, 3;
-; SM90-NEXT:    mov.b32 %r10, 255;
-; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i8 @weak_acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i8_shared_cta(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
 ; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [weak_acq_rel_acquire_i8_shared_cta_param_0];
-; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r6, [weak_acq_rel_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [weak_acq_rel_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
 ; SM90-NEXT:    and.b32 %r9, %r8, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r9, 3;
 ; SM90-NEXT:    mov.b32 %r10, 255;
 ; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
-; SM90-NEXT:    not.b32 %r12, %r11;
-; SM90-NEXT:    shl.b32 %r2, %r7, %r1;
-; SM90-NEXT:    shl.b32 %r3, %r6, %r1;
-; SM90-NEXT:    ld.shared.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r4, %r13, %r12;
-; SM90-NEXT:    or.b32 %r14, %r4, %r2;
-; SM90-NEXT:    or.b32 %r15, %r4, %r3;
-; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14;
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_generic_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_generic_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_generic_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_generic_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i32 @weak_acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: weak_acq_rel_acquire_i32_shared_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [weak_acq_rel_acquire_i32_shared_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [weak_acq_rel_acquire_i32_shared_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [weak_acq_rel_acquire_i32_shared_cta_param_2];
-; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
-; SM90-NEXT:    ret;
-    %pairold = cmpxchg weak ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
-}
-
-define i8 @strong_monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_monotonic_monotonic_i8_global_cta(
-; SM90:       {
-; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i8_global_cta_param_0];
-; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_monotonic_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r8, [strong_monotonic_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB69_3;
+; SM90-NEXT:    @%p1 bra $L__BB0_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB69_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB69_1;
-; SM90-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB0_1;
+; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
+    ret i8 %new
 }
 
-define i8 @strong_monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_monotonic_acquire_i8_global_cta(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_monotonic_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r8, [strong_monotonic_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB70_3;
+; SM90-NEXT:    @%p1 bra $L__BB1_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB70_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB1_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB70_1;
-; SM90-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB1_1;
+; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
+    ret i8 %new
 }
 
-define i8 @strong_monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_monotonic_seq_cst_i8_global_cta(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_monotonic_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_monotonic_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB71_3;
+; SM90-NEXT:    @%p1 bra $L__BB2_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB71_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB2_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB71_1;
-; SM90-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB2_1;
+; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acquire_monotonic_i8_global_cta(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acquire_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acquire_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB72_3;
+; SM90-NEXT:    @%p1 bra $L__BB3_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB72_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB3_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB72_1;
-; SM90-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB3_1;
+; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
+    ret i8 %new
 }
 
-define i8 @strong_acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acquire_acquire_i8_global_cta(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acquire_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acquire_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB73_3;
+; SM90-NEXT:    @%p1 bra $L__BB4_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB73_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB4_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB73_1;
-; SM90-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB4_1;
+; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+    ret i8 %new
 }
 
-define i8 @strong_acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acquire_seq_cst_i8_global_cta(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acquire_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acquire_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB74_3;
+; SM90-NEXT:    @%p1 bra $L__BB5_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB74_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB5_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB74_1;
-; SM90-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB5_1;
+; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_release_monotonic_i8_global_cta(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_release_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_release_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB75_3;
+; SM90-NEXT:    @%p1 bra $L__BB6_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB75_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB6_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB75_1;
-; SM90-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB6_1;
+; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+    ret i8 %new
 }
 
-define i8 @strong_release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_release_acquire_i8_global_cta(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_release_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_release_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB76_3;
+; SM90-NEXT:    @%p1 bra $L__BB7_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB76_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB7_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB76_1;
-; SM90-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB7_1;
+; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+    ret i8 %new
 }
 
-define i8 @strong_release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_release_seq_cst_i8_global_cta(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_release_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_release_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB77_3;
+; SM90-NEXT:    @%p1 bra $L__BB8_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB77_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB8_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB77_1;
-; SM90-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB8_1;
+; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acq_rel_monotonic_i8_global_cta(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acq_rel_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acq_rel_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB78_3;
+; SM90-NEXT:    @%p1 bra $L__BB9_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB78_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB9_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB78_1;
-; SM90-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB9_1;
+; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i8_global_cta(
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB79_3;
+; SM90-NEXT:    @%p1 bra $L__BB10_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB79_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB10_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB79_1;
-; SM90-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB10_1;
+; SM90-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acq_rel_seq_cst_i8_global_cta(
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acq_rel_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acq_rel_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB80_3;
+; SM90-NEXT:    @%p1 bra $L__BB11_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB80_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB11_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB80_1;
-; SM90-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB11_1;
+; SM90-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_seq_cst_monotonic_i8_global_cta(
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_seq_cst_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_seq_cst_monotonic_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB81_3;
+; SM90-NEXT:    @%p1 bra $L__BB12_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB81_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB12_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB81_1;
-; SM90-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB12_1;
+; SM90-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_seq_cst_acquire_i8_global_cta(
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_seq_cst_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_seq_cst_acquire_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB82_3;
+; SM90-NEXT:    @%p1 bra $L__BB13_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB82_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB13_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB82_1;
-; SM90-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB13_1;
+; SM90-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+    ret i8 %new
 }
 
-define i8 @strong_seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_seq_cst_seq_cst_i8_global_cta(
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_seq_cst_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_seq_cst_seq_cst_i8_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB83_3;
+; SM90-NEXT:    @%p1 bra $L__BB14_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB83_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB14_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB83_1;
-; SM90-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB14_1;
+; SM90-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+    ret i8 %new
 }
 
-define i16 @strong_monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_monotonic_monotonic_i16_global_cta(
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_monotonic_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r8, [strong_monotonic_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB84_3;
+; SM90-NEXT:    @%p1 bra $L__BB15_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB84_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB15_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB84_1;
-; SM90-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB15_1;
+; SM90-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+    ret i16 %new
 }
 
-define i16 @strong_monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_monotonic_acquire_i16_global_cta(
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_monotonic_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r8, [strong_monotonic_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB85_3;
+; SM90-NEXT:    @%p1 bra $L__BB16_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB85_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB16_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB85_1;
-; SM90-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB16_1;
+; SM90-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+    ret i16 %new
 }
 
-define i16 @strong_monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_monotonic_seq_cst_i16_global_cta(
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_monotonic_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_monotonic_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB86_3;
+; SM90-NEXT:    @%p1 bra $L__BB17_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB86_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB17_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB86_1;
-; SM90-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB17_1;
+; SM90-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_acquire_monotonic_i16_global_cta(
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_acquire_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r8, [strong_acquire_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB87_3;
+; SM90-NEXT:    @%p1 bra $L__BB18_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB87_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB18_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB87_1;
-; SM90-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB18_1;
+; SM90-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+    ret i16 %new
 }
 
-define i16 @strong_acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_acquire_acquire_i16_global_cta(
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_acquire_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    ld.param.b16 %r8, [strong_acquire_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB88_3;
+; SM90-NEXT:    @%p1 bra $L__BB19_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB88_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB19_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB88_1;
-; SM90-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB19_1;
+; SM90-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+    ret i16 %new
 }
 
-define i16 @strong_acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_acquire_seq_cst_i16_global_cta(
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_acquire_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_acquire_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB89_3;
+; SM90-NEXT:    @%p1 bra $L__BB20_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB20_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB89_1;
-; SM90-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB20_1;
+; SM90-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_release_monotonic_i16_global_cta(
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_release_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_release_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB90_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB90_3;
+; SM90-NEXT:    @%p1 bra $L__BB21_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB90_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB21_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB90_1;
-; SM90-NEXT:  $L__BB90_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB21_1;
+; SM90-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+    ret i16 %new
 }
 
-define i16 @strong_release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_release_acquire_i16_global_cta(
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_release_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_release_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB91_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB91_3;
+; SM90-NEXT:    @%p1 bra $L__BB22_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB91_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB22_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB91_1;
-; SM90-NEXT:  $L__BB91_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB22_1;
+; SM90-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+    ret i16 %new
 }
 
-define i16 @strong_release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_release_seq_cst_i16_global_cta(
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_release_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_release_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB92_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB92_3;
+; SM90-NEXT:    @%p1 bra $L__BB23_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB92_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB23_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB92_1;
-; SM90-NEXT:  $L__BB92_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB23_1;
+; SM90-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_acq_rel_monotonic_i16_global_cta(
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_acq_rel_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_acq_rel_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB93_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB93_3;
+; SM90-NEXT:    @%p1 bra $L__BB24_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB93_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB24_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB93_1;
-; SM90-NEXT:  $L__BB93_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB24_1;
+; SM90-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i16_global_cta(
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_acq_rel_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_acq_rel_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB94_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB94_3;
+; SM90-NEXT:    @%p1 bra $L__BB25_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB94_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB25_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB94_1;
-; SM90-NEXT:  $L__BB94_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB25_1;
+; SM90-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+    ret i16 %new
 }
 
-define i16 @strong_acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_acq_rel_seq_cst_i16_global_cta(
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_acq_rel_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_acq_rel_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB95_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB95_3;
+; SM90-NEXT:    @%p1 bra $L__BB26_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB95_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB26_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB95_1;
-; SM90-NEXT:  $L__BB95_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB26_1;
+; SM90-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_seq_cst_monotonic_i16_global_cta(
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_seq_cst_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_seq_cst_monotonic_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB96_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB96_3;
+; SM90-NEXT:    @%p1 bra $L__BB27_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB96_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB27_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB96_1;
-; SM90-NEXT:  $L__BB96_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB27_1;
+; SM90-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_seq_cst_acquire_i16_global_cta(
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_seq_cst_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_seq_cst_acquire_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB97_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB97_3;
+; SM90-NEXT:    @%p1 bra $L__BB28_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB97_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB28_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB97_1;
-; SM90-NEXT:  $L__BB97_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB28_1;
+; SM90-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+    ret i16 %new
 }
 
-define i16 @strong_seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: strong_seq_cst_seq_cst_i16_global_cta(
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r7, [strong_seq_cst_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r8, [strong_seq_cst_seq_cst_i16_global_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 65535;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB98_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB98_3;
+; SM90-NEXT:    @%p1 bra $L__BB29_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB98_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB29_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB98_1;
-; SM90-NEXT:  $L__BB98_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB29_1;
+; SM90-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i16, i1 } %pairold, 0
-    ret i16 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+    ret i16 %new
 }
 
-define i32 @strong_monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_monotonic_monotonic_i32_global_cta(
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_monotonic_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_monotonic_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_monotonic_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+    ret i32 %new
 }
 
-define i32 @strong_monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_monotonic_acquire_i32_global_cta(
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_monotonic_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_monotonic_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_monotonic_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+    ret i32 %new
 }
 
-define i32 @strong_monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_monotonic_seq_cst_i32_global_cta(
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_monotonic_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_monotonic_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_monotonic_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acquire_monotonic_i32_global_cta(
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acquire_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acquire_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acquire_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+    ret i32 %new
 }
 
-define i32 @strong_acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acquire_acquire_i32_global_cta(
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acquire_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acquire_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acquire_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+    ret i32 %new
 }
 
-define i32 @strong_acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acquire_seq_cst_i32_global_cta(
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acquire_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acquire_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acquire_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_release_monotonic_i32_global_cta(
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_release_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_release_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_release_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+    ret i32 %new
 }
 
-define i32 @strong_release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_release_acquire_i32_global_cta(
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_release_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_release_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_release_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+    ret i32 %new
 }
 
-define i32 @strong_release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_release_seq_cst_i32_global_cta(
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_release_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_release_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_release_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_monotonic_i32_global_cta(
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_monotonic_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_global_cta(
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_seq_cst_i32_global_cta(
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_seq_cst_monotonic_i32_global_cta(
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_monotonic_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_seq_cst_monotonic_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_seq_cst_monotonic_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_seq_cst_acquire_i32_global_cta(
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_acquire_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_seq_cst_acquire_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_seq_cst_acquire_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+    ret i32 %new
 }
 
-define i32 @strong_seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_seq_cst_seq_cst_i32_global_cta(
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b32 %r1, [strong_seq_cst_seq_cst_i32_global_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_seq_cst_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+    ret i32 %new
 }
 
-define i64 @strong_monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_monotonic_monotonic_i64_global_cta(
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_monotonic_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_monotonic_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+    ret i64 %new
 }
 
-define i64 @strong_monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_monotonic_acquire_i64_global_cta(
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_monotonic_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_monotonic_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+    ret i64 %new
 }
 
-define i64 @strong_monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_monotonic_seq_cst_i64_global_cta(
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_monotonic_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_monotonic_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_monotonic_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_acquire_monotonic_i64_global_cta(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acquire_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_acquire_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+    ret i64 %new
 }
 
-define i64 @strong_acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_acquire_acquire_i64_global_cta(
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acquire_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_acquire_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+    ret i64 %new
 }
 
-define i64 @strong_acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_acquire_seq_cst_i64_global_cta(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acquire_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acquire_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_acquire_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_release_monotonic_i64_global_cta(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_release_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_release_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+    ret i64 %new
 }
 
-define i64 @strong_release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_release_acquire_i64_global_cta(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_release_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_release_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+    ret i64 %new
 }
 
-define i64 @strong_release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_release_seq_cst_i64_global_cta(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_release_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_release_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_release_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_acq_rel_monotonic_i64_global_cta(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_monotonic_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i64_global_cta(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i64_global_cta_param_0];
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+    ret i64 %new
 }
 
-define i64 @strong_acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_acq_rel_seq_cst_i64_global_cta(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_acq_rel_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_seq_cst_monotonic_i64_global_cta(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_monotonic_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_monotonic_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_monotonic_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_seq_cst_acquire_i64_global_cta(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_acquire_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_acquire_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_acquire_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
+    ret i64 %new
 }
 
-define i64 @strong_seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: strong_seq_cst_seq_cst_i64_global_cta(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_seq_cst_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_seq_cst_seq_cst_i64_global_cta_param_1];
-; SM90-NEXT:    ld.param.b64 %rd3, [strong_seq_cst_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
 ; SM90-NEXT:    atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM90-NEXT:    st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
-    %oldvalue = extractvalue { i64, i1 } %pairold, 0
-    ret i64 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
+    ret i64 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i8_global(
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_global_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB129_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB129_3;
+; SM90-NEXT:    @%p1 bra $L__BB60_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB129_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB60_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB129_1;
-; SM90-NEXT:  $L__BB129_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB60_1;
+; SM90-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_global(
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_global_sys(
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_sys(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_sys_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_sys_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_sys_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
 ; SM90-NEXT:    atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_global_cluster(
+define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_cluster(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_cluster_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_cluster_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_cluster_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2];
 ; SM90-NEXT:    atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_global_gpu(
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_gpu(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_global_gpu_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_global_gpu_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_global_gpu_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
 ; SM90-NEXT:    atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+    ret i32 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i8_generic_cta(
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_generic_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_generic_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_generic_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB134_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB134_3;
+; SM90-NEXT:    @%p1 bra $L__BB65_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB134_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB65_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB134_1;
-; SM90-NEXT:  $L__BB134_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB65_1;
+; SM90-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i8 @strong_acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i8_shared_cta(
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<18>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd2, [strong_acq_rel_acquire_i8_shared_cta_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r7, [strong_acq_rel_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r8, [strong_acq_rel_acquire_i8_shared_cta_param_2];
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    shl.b32 %r3, %r8, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r13, [%rd1];
-; SM90-NEXT:    and.b32 %r17, %r13, %r2;
-; SM90-NEXT:  $L__BB135_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
+; SM90-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r14, %r17, %r3;
-; SM90-NEXT:    or.b32 %r15, %r17, %r4;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
 ; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14;
 ; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
-; SM90-NEXT:    @%p1 bra $L__BB135_3;
+; SM90-NEXT:    @%p1 bra $L__BB66_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT:    // in Loop: Header=BB135_1 Depth=1
+; SM90-NEXT:    // in Loop: Header=BB66_1 Depth=1
 ; SM90-NEXT:    and.b32 %r6, %r5, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM90-NEXT:    mov.b32 %r17, %r6;
-; SM90-NEXT:    @%p2 bra $L__BB135_1;
-; SM90-NEXT:  $L__BB135_3: // %partword.cmpxchg.end
-; SM90-NEXT:    shr.u32 %r16, %r5, %r1;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
+; SM90-NEXT:    @%p2 bra $L__BB66_1;
+; SM90-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r16;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i8, i1 } %pairold, 0
-    ret i8 %oldvalue
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+    ret i8 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_generic_cta(
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_generic_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_generic_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_generic_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 
-define i32 @strong_acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: strong_acq_rel_acquire_i32_shared_cta(
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_cta(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b32 %r<4>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b64 %rd1, [strong_acq_rel_acquire_i32_shared_cta_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [strong_acq_rel_acquire_i32_shared_cta_param_1];
-; SM90-NEXT:    ld.param.b32 %r2, [strong_acq_rel_acquire_i32_shared_cta_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
 ; SM90-NEXT:    atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
-    %pairold = cmpxchg  ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
-    %oldvalue = extractvalue { i32, i1 } %pairold, 0
-    ret i32 %oldvalue
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+    ret i32 %new
 }
 

diff  --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
index 074662b764bfe..75623a59ad481 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.py
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -5,19 +5,17 @@
 from itertools import product
 
 cmpxchg_func = Template(
-    """define i$size @${strength}_${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
-    %pairold = cmpxchg ${weak} ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure
-    %oldvalue = extractvalue { i$size, i1 } %pairold, 0
-    ret i$size %oldvalue
+    """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+    %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure
+    ret i$size %new
 }
 """
 )
 
 cmpxchg_func_no_scope = Template(
-    """define i$size @${strength}_${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
-    %pairold = cmpxchg ${weak} ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
-    %oldvalue = extractvalue { i$size, i1 } %pairold, 0
-    ret i$size %oldvalue
+    """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+    %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
+    ret i$size %new
 }
 """
 )
@@ -37,7 +35,6 @@ def get_addrspace_cast(addrspace):
 
 
 TESTS = [(60, 50), (70, 63), (90, 87)]
-# We don't include (100, 90) because the codegen is identical to (90, 87)
 
 LLVM_SCOPES = ["", "block", "cluster", "device"]
 
@@ -47,100 +44,90 @@ def get_addrspace_cast(addrspace):
 
 FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"]
 
-STRENGTHS = ["weak", "strong"]
-
 SIZES = [8, 16, 32, 64]
 
 ADDRSPACES = [0, 1, 3]
 
 ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}
 
+
 if __name__ == "__main__":
     for sm, ptx in TESTS:
         with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
             print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
-            # Test weak and strong cmpxchg for all slices
-            for strength in STRENGTHS:
-                # Our test space is: SIZES X SUCCESS_ORDERINGS X FAILURE_ORDERINGS X ADDRSPACES X LLVM_SCOPES
-                # This is very large, so we instead test 3 slices.
-
-                # First slice:  are all orderings correctly supported, with and without emulation loops?
-                # set addrspace to global, scope to cta, generate all possible orderings, for all operation sizes
-                addrspace, llvm_scope = 1, "block"
-                for size, success, failure in product(
-                    SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS
-                ):
-                    print(
-                        cmpxchg_func.substitute(
-                            success=success,
-                            failure=failure,
-                            size=size,
-                            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
-                            addrspace_cast=get_addrspace_cast(addrspace),
-                            llvm_scope=llvm_scope,
-                            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
-                            strength=strength,
-                            weak="weak" if strength == "weak" else "",
-                        ),
-                        file=fp,
-                    )
-
-                # Second slice: Are all scopes correctly supported, with and without emulation loops?
-                # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32
-                addrspace, success, failure = 1, "acq_rel", "acquire"
-                for size in [8, 32]:
-                    print(
-                        cmpxchg_func_no_scope.substitute(
-                            success=success,
-                            failure=failure,
-                            size=size,
-                            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
-                            addrspace_cast=get_addrspace_cast(addrspace),
-                            strength=strength,
-                            weak="weak" if strength == "weak" else "",
-                        ),
-                        file=fp,
-                    )
-
-                for llvm_scope in LLVM_SCOPES:
-                    if sm < 90 and llvm_scope == "cluster":
-                        continue
-                    if llvm_scope == "block":
-                        # skip (acq_rel, acquire, global, cta)
-                        continue
-                    print(
-                        cmpxchg_func.substitute(
-                            success=success,
-                            failure=failure,
-                            size=size,
-                            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
-                            addrspace_cast=get_addrspace_cast(addrspace),
-                            llvm_scope=llvm_scope,
-                            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
-                            strength=strength,
-                            weak="weak" if strength == "weak" else "",
-                        ),
-                        file=fp,
-                    )
-
-                # Third slice: Are all address spaces correctly supported?
-                # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32
-                success, failure, llvm_scope = "acq_rel", "acquire", "block"
-                for size, addrspace in product([8, 32], ADDRSPACES):
-                    if addrspace == 1:
-                        # skip (acq_rel, acquire, global, cta)
-                        continue
-                    print(
-                        cmpxchg_func.substitute(
-                            success=success,
-                            failure=failure,
-                            size=size,
-                            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
-                            addrspace_cast=get_addrspace_cast(addrspace),
-                            llvm_scope=llvm_scope,
-                            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
-                            strength=strength,
-                            weak="weak" if strength == "weak" else "",
-                        ),
-                        file=fp,
-                    )
+
+            # Our test space is: SIZES X SUCCESS_ORDERINGS X FAILURE_ORDERINGS X ADDRSPACES X LLVM_SCOPES
+            # This is very large, so we instead test 3 slices.
+
+            # First slice:  are all orderings correctly supported, with and without emulation loops?
+            # set addrspace to global, scope to cta, generate all possible orderings, for all operation sizes
+            addrspace, llvm_scope = 1, "block"
+            for size, success, failure in product(
+                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS
+            ):
+                print(
+                    cmpxchg_func.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                        llvm_scope=llvm_scope,
+                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+                    ),
+                    file=fp,
+                )
+
+            # Second slice: Are all scopes correctly supported, with and without emulation loops?
+            # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32
+            addrspace, success, failure = 1, "acq_rel", "acquire"
+            for size in [8, 32]:
+                print(
+                    cmpxchg_func_no_scope.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                    ),
+                    file=fp,
+                )
+
+            for llvm_scope in LLVM_SCOPES:
+                if sm < 90 and llvm_scope == "cluster":
+                    continue
+                if llvm_scope == "block":
+                    # skip (acq_rel, acquire, global, cta)
+                    continue
+                print(
+                    cmpxchg_func.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                        llvm_scope=llvm_scope,
+                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+                    ),
+                    file=fp,
+                )
+
+            # Third slice: Are all address spaces correctly supported?
+            # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32
+            success, failure, llvm_scope = "acq_rel", "acquire", "block"
+            for size, addrspace in product([8, 32], ADDRSPACES):
+                if addrspace == 1:
+                    # skip (acq_rel, acquire, global, cta)
+                    continue
+                print(
+                    cmpxchg_func.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                        llvm_scope=llvm_scope,
+                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+                    ),
+                    file=fp,
+                )


        


More information about the llvm-branch-commits mailing list