[llvm] [NVPTX] Skip numbering unreferenced virtual registers (readability) (PR #154391)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 19 10:15:05 PDT 2025


https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/154391

When assigning numbers to registers, skip any with neither uses nor defs. This will not have any impact at all on the final SASS, but it makes the PTX slightly more readable and more consistent across minor changes.

>From 1dcc585a783e2858af3290b379a16f0bcaf1ae80 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 19 Aug 2025 17:14:15 +0000
Subject: [PATCH] [NVPTX] Skip numbering unreferenced virtual registers
 (readability)

---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     |   15 +-
 llvm/test/CodeGen/NVPTX/aggregate-return.ll   |   20 +-
 llvm/test/CodeGen/NVPTX/atomics-sm70.ll       |  124 +-
 llvm/test/CodeGen/NVPTX/atomics-sm90.ll       |  124 +-
 llvm/test/CodeGen/NVPTX/atomics.ll            |   46 +-
 .../test/CodeGen/NVPTX/bf16x2-instructions.ll |    2 +-
 .../test/CodeGen/NVPTX/byval-arg-vectorize.ll |    2 +-
 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll       | 1422 ++++++++---------
 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll       | 1422 ++++++++---------
 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll       | 1422 ++++++++---------
 llvm/test/CodeGen/NVPTX/cmpxchg.ll            |  860 +++++-----
 llvm/test/CodeGen/NVPTX/combine-mad.ll        |    2 +-
 .../CodeGen/NVPTX/convert-call-to-indirect.ll |  124 +-
 llvm/test/CodeGen/NVPTX/cse-mov-sym.ll        |   24 +-
 .../NVPTX/distributed-shared-cluster.ll       |  116 +-
 llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll |    4 +-
 llvm/test/CodeGen/NVPTX/extractelement.ll     |    8 +-
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll |    6 +-
 llvm/test/CodeGen/NVPTX/f32x2-instructions.ll |    6 +-
 llvm/test/CodeGen/NVPTX/fma.ll                |    4 +-
 llvm/test/CodeGen/NVPTX/forward-ld-param.ll   |   11 +-
 llvm/test/CodeGen/NVPTX/i1-select.ll          |   38 +-
 llvm/test/CodeGen/NVPTX/i128-array.ll         |   10 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |  762 ++++-----
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll |    6 +-
 llvm/test/CodeGen/NVPTX/i8x2-instructions.ll  |    4 +-
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll  |   12 +-
 llvm/test/CodeGen/NVPTX/indirect_byval.ll     |   18 +-
 .../CodeGen/NVPTX/inline-asm-b128-test3.ll    |   20 +-
 llvm/test/CodeGen/NVPTX/jump-table.ll         |    8 +-
 llvm/test/CodeGen/NVPTX/ld-param-sink.ll      |    6 +-
 llvm/test/CodeGen/NVPTX/ldparam-v4.ll         |    2 +-
 .../NVPTX/load-with-non-coherent-cache.ll     |   56 +-
 llvm/test/CodeGen/NVPTX/local-stack-frame.ll  |   30 +-
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  |   24 +-
 llvm/test/CodeGen/NVPTX/lower-byval-args.ll   |  145 +-
 llvm/test/CodeGen/NVPTX/misched_func_call.ll  |   22 +-
 llvm/test/CodeGen/NVPTX/param-add.ll          |    2 +-
 llvm/test/CodeGen/NVPTX/param-overalign.ll    |    4 +-
 llvm/test/CodeGen/NVPTX/surf-read-cuda.ll     |    2 +-
 llvm/test/CodeGen/NVPTX/surf-write-cuda.ll    |    1 -
 llvm/test/CodeGen/NVPTX/tex-read-cuda.ll      |    8 +-
 llvm/test/CodeGen/NVPTX/texsurf-queries.ll    |    4 -
 .../NVPTX/unaligned-param-load-store.ll       |  234 +--
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  |   96 +-
 45 files changed, 3634 insertions(+), 3644 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 38912a7f09e30..077fb56910c07 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   // Map the global virtual register number to a register class specific
   // virtual register number starting from 1 with that class.
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  //unsigned numRegClasses = TRI->getNumRegClasses();
 
   // Emit the Fake Stack Object
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1479,13 +1478,13 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   // global virtual
   // register number and the per class virtual register number.
   // We use the per class virtual register number in the ptx output.
-  unsigned int numVRs = MRI->getNumVirtRegs();
-  for (unsigned i = 0; i < numVRs; i++) {
-    Register vr = Register::index2VirtReg(i);
-    const TargetRegisterClass *RC = MRI->getRegClass(vr);
-    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
-    int n = regmap.size();
-    regmap.insert(std::make_pair(vr, n + 1));
+  for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) {
+    Register VR = Register::index2VirtReg(I);
+    if (MRI->use_empty(VR) && MRI->def_empty(VR))
+      continue;
+    const TargetRegisterClass *RC = MRI->getRegClass(VR);
+    DenseMap<unsigned, unsigned> &RCRegMap = VRegMapping[RC];
+    RCRegMap.insert(std::make_pair(VR, RCRegMap.size() + 1));
   }
 
   // Emit declaration of the virtual registers or 'physical' registers for
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index abc873e2aa706..bf51973e88357 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -10,7 +10,7 @@ declare {float, float} @bars({float, float} %input)
 define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-LABEL: test_v2f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_v2f32_param_0];
@@ -21,8 +21,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-NEXT:    call.uni (retval0), barv, (param0);
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    ld.param.b64 %rd4, [test_v2f32_param_1];
-; CHECK-NEXT:    st.b64 [%rd4], %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_v2f32_param_1];
+; CHECK-NEXT:    st.b64 [%rd3], %rd2;
 ; CHECK-NEXT:    ret;
   %call = tail call <2 x float> @barv(<2 x float> %input)
   store <2 x float> %call, ptr %output, align 8
@@ -32,8 +32,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 define void @test_v3f32(<3 x float> %input, ptr %output) {
 ; CHECK-LABEL: test_v3f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_v3f32_param_0];
@@ -47,9 +47,9 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
 ; CHECK-NEXT:    ld.param.b32 %r2, [retval0+8];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0];
 ; CHECK-NEXT:    } // callseq 1
-; CHECK-NEXT:    ld.param.b64 %rd4, [test_v3f32_param_1];
-; CHECK-NEXT:    st.b32 [%rd4+8], %r2;
-; CHECK-NEXT:    st.b64 [%rd4], %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_v3f32_param_1];
+; CHECK-NEXT:    st.b32 [%rd3+8], %r2;
+; CHECK-NEXT:    st.b64 [%rd3], %rd2;
 ; CHECK-NEXT:    ret;
   %call = tail call <3 x float> @barv3(<3 x float> %input)
 ; Make sure we don't load more values than than we need to.
@@ -60,7 +60,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
 define void @test_a2f32([2 x float] %input, ptr %output) {
 ; CHECK-LABEL: test_a2f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -87,7 +87,7 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
 define void @test_s2f32({float, float} %input, ptr %output) {
 ; CHECK-LABEL: test_s2f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index f710d7f883a1b..5f4856acb317c 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -47,90 +47,90 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62:       {
 ; CHECKPTX62-NEXT:    .reg .pred %p<5>;
 ; CHECKPTX62-NEXT:    .reg .b16 %rs<11>;
-; CHECKPTX62-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX62-NEXT:    .reg .b32 %r<50>;
 ; CHECKPTX62-EMPTY:
 ; CHECKPTX62-NEXT:  // %bb.0:
 ; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX62-NEXT:    ld.param.b32 %r23, [test_param_2];
-; CHECKPTX62-NEXT:    ld.param.b32 %r22, [test_param_1];
-; CHECKPTX62-NEXT:    ld.param.b32 %r24, [test_param_0];
-; CHECKPTX62-NEXT:    and.b32 %r1, %r24, -4;
-; CHECKPTX62-NEXT:    and.b32 %r25, %r24, 3;
-; CHECKPTX62-NEXT:    shl.b32 %r2, %r25, 3;
-; CHECKPTX62-NEXT:    mov.b32 %r26, 65535;
-; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
-; CHECKPTX62-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX62-NEXT:    ld.b32 %r54, [%r1];
+; CHECKPTX62-NEXT:    ld.param.b32 %r15, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.b32 %r14, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.b32 %r16, [test_param_0];
+; CHECKPTX62-NEXT:    and.b32 %r1, %r16, -4;
+; CHECKPTX62-NEXT:    and.b32 %r17, %r16, 3;
+; CHECKPTX62-NEXT:    shl.b32 %r2, %r17, 3;
+; CHECKPTX62-NEXT:    mov.b32 %r18, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r19, %r18, %r2;
+; CHECKPTX62-NEXT:    not.b32 %r3, %r19;
+; CHECKPTX62-NEXT:    ld.b32 %r46, [%r1];
 ; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r28, %r54, %r2;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX62-NEXT:    shr.u32 %r20, %r46, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r20;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs3, %rs2, %rs1;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r29, %rs3;
-; CHECKPTX62-NEXT:    shl.b32 %r30, %r29, %r2;
-; CHECKPTX62-NEXT:    and.b32 %r31, %r54, %r3;
-; CHECKPTX62-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p1, %r6, %r54;
-; CHECKPTX62-NEXT:    mov.b32 %r54, %r6;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r21, %rs3;
+; CHECKPTX62-NEXT:    shl.b32 %r22, %r21, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r23, %r46, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r24, %r23, %r22;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p1, %r4, %r46;
+; CHECKPTX62-NEXT:    mov.b32 %r46, %r4;
 ; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
 ; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX62-NEXT:    ld.b32 %r55, [%r1];
+; CHECKPTX62-NEXT:    ld.b32 %r47, [%r1];
 ; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r33, %r55, %r2;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs4, %r33;
+; CHECKPTX62-NEXT:    shr.u32 %r25, %r47, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs4, %r25;
 ; CHECKPTX62-NEXT:    mov.b16 %rs5, 0x3C00;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs6, %rs4, %rs5;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs6;
-; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r2;
-; CHECKPTX62-NEXT:    and.b32 %r36, %r55, %r3;
-; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p2, %r9, %r55;
-; CHECKPTX62-NEXT:    mov.b32 %r55, %r9;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r28, %r47, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r29, %r28, %r27;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p2, %r5, %r47;
+; CHECKPTX62-NEXT:    mov.b32 %r47, %r5;
 ; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
 ; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end26
-; CHECKPTX62-NEXT:    and.b32 %r10, %r22, -4;
-; CHECKPTX62-NEXT:    shl.b32 %r38, %r22, 3;
-; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
-; CHECKPTX62-NEXT:    mov.b32 %r39, 65535;
-; CHECKPTX62-NEXT:    shl.b32 %r40, %r39, %r11;
-; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX62-NEXT:    ld.global.b32 %r56, [%r10];
+; CHECKPTX62-NEXT:    and.b32 %r6, %r14, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r30, %r14, 3;
+; CHECKPTX62-NEXT:    and.b32 %r7, %r30, 24;
+; CHECKPTX62-NEXT:    mov.b32 %r31, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r32, %r31, %r7;
+; CHECKPTX62-NEXT:    not.b32 %r8, %r32;
+; CHECKPTX62-NEXT:    ld.global.b32 %r48, [%r6];
 ; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r41, %r56, %r11;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs7, %r41;
+; CHECKPTX62-NEXT:    shr.u32 %r33, %r48, %r7;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs7, %r33;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs8, %rs7, %rs1;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs8;
-; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
-; CHECKPTX62-NEXT:    and.b32 %r44, %r56, %r12;
-; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p3, %r15, %r56;
-; CHECKPTX62-NEXT:    mov.b32 %r56, %r15;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs8;
+; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r7;
+; CHECKPTX62-NEXT:    and.b32 %r36, %r48, %r8;
+; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p3, %r9, %r48;
+; CHECKPTX62-NEXT:    mov.b32 %r48, %r9;
 ; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
 ; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end8
-; CHECKPTX62-NEXT:    and.b32 %r16, %r23, -4;
-; CHECKPTX62-NEXT:    shl.b32 %r46, %r23, 3;
-; CHECKPTX62-NEXT:    and.b32 %r17, %r46, 24;
-; CHECKPTX62-NEXT:    mov.b32 %r47, 65535;
-; CHECKPTX62-NEXT:    shl.b32 %r48, %r47, %r17;
-; CHECKPTX62-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX62-NEXT:    ld.shared.b32 %r57, [%r16];
+; CHECKPTX62-NEXT:    and.b32 %r10, %r15, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r38, %r15, 3;
+; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX62-NEXT:    mov.b32 %r39, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r40, %r39, %r11;
+; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX62-NEXT:    ld.shared.b32 %r49, [%r10];
 ; CHECKPTX62-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r49, %r57, %r17;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs9, %r49;
+; CHECKPTX62-NEXT:    shr.u32 %r41, %r49, %r11;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs9, %r41;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs10, %rs9, %rs1;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r50, %rs10;
-; CHECKPTX62-NEXT:    shl.b32 %r51, %r50, %r17;
-; CHECKPTX62-NEXT:    and.b32 %r52, %r57, %r18;
-; CHECKPTX62-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p4, %r21, %r57;
-; CHECKPTX62-NEXT:    mov.b32 %r57, %r21;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs10;
+; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX62-NEXT:    and.b32 %r44, %r49, %r12;
+; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p4, %r13, %r49;
+; CHECKPTX62-NEXT:    mov.b32 %r49, %r13;
 ; CHECKPTX62-NEXT:    @%p4 bra $L__BB0_7;
 ; CHECKPTX62-NEXT:  // %bb.8: // %atomicrmw.end
 ; CHECKPTX62-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index f96fd30019025..e560d4386c20d 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -47,93 +47,93 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71:       {
 ; CHECKPTX71-NEXT:    .reg .pred %p<5>;
 ; CHECKPTX71-NEXT:    .reg .b16 %rs<14>;
-; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX71-NEXT:    .reg .b32 %r<50>;
 ; CHECKPTX71-EMPTY:
 ; CHECKPTX71-NEXT:  // %bb.0:
 ; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT:    ld.param.b32 %r23, [test_param_2];
-; CHECKPTX71-NEXT:    ld.param.b32 %r22, [test_param_1];
-; CHECKPTX71-NEXT:    ld.param.b32 %r24, [test_param_0];
-; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
-; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
-; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
-; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX71-NEXT:    ld.b32 %r54, [%r1];
+; CHECKPTX71-NEXT:    ld.param.b32 %r15, [test_param_2];
+; CHECKPTX71-NEXT:    ld.param.b32 %r14, [test_param_1];
+; CHECKPTX71-NEXT:    ld.param.b32 %r16, [test_param_0];
+; CHECKPTX71-NEXT:    and.b32 %r1, %r16, -4;
+; CHECKPTX71-NEXT:    and.b32 %r17, %r16, 3;
+; CHECKPTX71-NEXT:    shl.b32 %r2, %r17, 3;
+; CHECKPTX71-NEXT:    mov.b32 %r18, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r19, %r18, %r2;
+; CHECKPTX71-NEXT:    not.b32 %r3, %r19;
+; CHECKPTX71-NEXT:    ld.b32 %r46, [%r1];
 ; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX71-NEXT:    shr.u32 %r20, %r46, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r20;
 ; CHECKPTX71-NEXT:    mov.b16 %rs3, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
-; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p1, %r6, %r54;
-; CHECKPTX71-NEXT:    mov.b32 %r54, %r6;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r21, %rs4;
+; CHECKPTX71-NEXT:    shl.b32 %r22, %r21, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r23, %r46, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r24, %r23, %r22;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p1, %r4, %r46;
+; CHECKPTX71-NEXT:    mov.b32 %r46, %r4;
 ; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
 ; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX71-NEXT:    ld.b32 %r55, [%r1];
+; CHECKPTX71-NEXT:    ld.b32 %r47, [%r1];
 ; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs5, %r33;
+; CHECKPTX71-NEXT:    shr.u32 %r25, %r47, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs5, %r25;
 ; CHECKPTX71-NEXT:    mov.b16 %rs6, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs7;
-; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p2, %r9, %r55;
-; CHECKPTX71-NEXT:    mov.b32 %r55, %r9;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r26, %rs7;
+; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r28, %r47, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r29, %r28, %r27;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p2, %r5, %r47;
+; CHECKPTX71-NEXT:    mov.b32 %r47, %r5;
 ; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
 ; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end26
-; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
-; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
-; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX71-NEXT:    ld.global.b32 %r56, [%r10];
+; CHECKPTX71-NEXT:    and.b32 %r6, %r14, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r30, %r14, 3;
+; CHECKPTX71-NEXT:    and.b32 %r7, %r30, 24;
+; CHECKPTX71-NEXT:    mov.b32 %r31, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r32, %r31, %r7;
+; CHECKPTX71-NEXT:    not.b32 %r8, %r32;
+; CHECKPTX71-NEXT:    ld.global.b32 %r48, [%r6];
 ; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs8, %r41;
+; CHECKPTX71-NEXT:    shr.u32 %r33, %r48, %r7;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs8, %r33;
 ; CHECKPTX71-NEXT:    mov.b16 %rs9, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs10;
-; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
-; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p3, %r15, %r56;
-; CHECKPTX71-NEXT:    mov.b32 %r56, %r15;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs10;
+; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r7;
+; CHECKPTX71-NEXT:    and.b32 %r36, %r48, %r8;
+; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p3, %r9, %r48;
+; CHECKPTX71-NEXT:    mov.b32 %r48, %r9;
 ; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
 ; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end8
-; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
-; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
-; CHECKPTX71-NEXT:    mov.b32 %r47, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r48, %r47, %r17;
-; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX71-NEXT:    ld.shared.b32 %r57, [%r16];
+; CHECKPTX71-NEXT:    and.b32 %r10, %r15, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r38, %r15, 3;
+; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
+; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX71-NEXT:    ld.shared.b32 %r49, [%r10];
 ; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs11, %r49;
+; CHECKPTX71-NEXT:    shr.u32 %r41, %r49, %r11;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs11, %r41;
 ; CHECKPTX71-NEXT:    mov.b16 %rs12, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs13;
-; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
-; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
-; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p4, %r21, %r57;
-; CHECKPTX71-NEXT:    mov.b32 %r57, %r21;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs13;
+; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX71-NEXT:    and.b32 %r44, %r49, %r12;
+; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p4, %r13, %r49;
+; CHECKPTX71-NEXT:    mov.b32 %r49, %r13;
 ; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
 ; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end
 ; CHECKPTX71-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 04a58cf22cfc5..6ea02f35e9626 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -425,40 +425,40 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<20>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0];
 ; CHECK-NEXT:    and.b64 %rd1, %rd2, -4;
-; CHECK-NEXT:    cvt.u32.u64 %r6, %rd2;
-; CHECK-NEXT:    and.b32 %r7, %r6, 3;
-; CHECK-NEXT:    shl.b32 %r1, %r7, 3;
-; CHECK-NEXT:    mov.b32 %r8, 65535;
-; CHECK-NEXT:    shl.b32 %r9, %r8, %r1;
-; CHECK-NEXT:    not.b32 %r2, %r9;
-; CHECK-NEXT:    ld.b32 %r19, [%rd1];
-; CHECK-NEXT:    cvt.f32.f16 %r12, %rs1;
+; CHECK-NEXT:    cvt.u32.u64 %r4, %rd2;
+; CHECK-NEXT:    and.b32 %r5, %r4, 3;
+; CHECK-NEXT:    shl.b32 %r1, %r5, 3;
+; CHECK-NEXT:    mov.b32 %r6, 65535;
+; CHECK-NEXT:    shl.b32 %r7, %r6, %r1;
+; CHECK-NEXT:    not.b32 %r2, %r7;
+; CHECK-NEXT:    ld.b32 %r17, [%rd1];
+; CHECK-NEXT:    cvt.f32.f16 %r10, %rs1;
 ; CHECK-NEXT:  $L__BB24_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u32 %r10, %r19, %r1;
-; CHECK-NEXT:    cvt.u16.u32 %rs2, %r10;
-; CHECK-NEXT:    cvt.f32.f16 %r11, %rs2;
-; CHECK-NEXT:    add.rn.f32 %r13, %r11, %r12;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r13;
-; CHECK-NEXT:    cvt.u32.u16 %r14, %rs3;
-; CHECK-NEXT:    shl.b32 %r15, %r14, %r1;
-; CHECK-NEXT:    and.b32 %r16, %r19, %r2;
-; CHECK-NEXT:    or.b32 %r17, %r16, %r15;
+; CHECK-NEXT:    shr.u32 %r8, %r17, %r1;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r8;
+; CHECK-NEXT:    cvt.f32.f16 %r9, %rs2;
+; CHECK-NEXT:    add.rn.f32 %r11, %r9, %r10;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r11;
+; CHECK-NEXT:    cvt.u32.u16 %r12, %rs3;
+; CHECK-NEXT:    shl.b32 %r13, %r12, %r1;
+; CHECK-NEXT:    and.b32 %r14, %r17, %r2;
+; CHECK-NEXT:    or.b32 %r15, %r14, %r13;
 ; CHECK-NEXT:    membar.sys;
-; CHECK-NEXT:    atom.cas.b32 %r5, [%rd1], %r19, %r17;
-; CHECK-NEXT:    setp.ne.b32 %p1, %r5, %r19;
-; CHECK-NEXT:    mov.b32 %r19, %r5;
+; CHECK-NEXT:    atom.cas.b32 %r3, [%rd1], %r17, %r15;
+; CHECK-NEXT:    setp.ne.b32 %p1, %r3, %r17;
+; CHECK-NEXT:    mov.b32 %r17, %r3;
 ; CHECK-NEXT:    @%p1 bra $L__BB24_1;
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    shr.u32 %r18, %r5, %r1;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %r18;
+; CHECK-NEXT:    shr.u32 %r16, %r3, %r1;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %r16;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr %addr, half %val seq_cst
   ret half %ret
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index b4641d01eb927..bd4c7775354ae 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -204,7 +204,7 @@ declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0
 define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_call(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_call_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll
index 9988d5b122cc1..579f02a9539c6 100644
--- a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll
+++ b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll
@@ -11,7 +11,7 @@ declare %struct.double2 @add(ptr align(16) byval(%struct.double2), ptr align(16)
 define void @call_byval(ptr %out, ptr %in1, ptr %in2) {
 ; CHECK-LABEL: call_byval(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<12>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [call_byval_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 63c389c36e87e..6e480996e7e6a 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -7,41 +7,41 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB0_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB0_1;
 ; SM60-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
@@ -52,42 +52,42 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB1_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB1_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB1_1;
 ; SM60-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
@@ -98,43 +98,43 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB2_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB2_1;
 ; SM60-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
@@ -145,42 +145,42 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB3_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB3_1;
 ; SM60-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
@@ -191,42 +191,42 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB4_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB4_1;
 ; SM60-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
@@ -237,43 +237,43 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB5_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB5_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB5_1;
 ; SM60-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
     ret i8 %new
@@ -284,42 +284,42 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB6_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB6_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB6_1;
 ; SM60-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
     ret i8 %new
@@ -330,43 +330,43 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB7_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB7_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB7_1;
 ; SM60-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
     ret i8 %new
@@ -377,43 +377,43 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB8_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB8_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB8_1;
 ; SM60-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
     ret i8 %new
@@ -424,43 +424,43 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB9_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB9_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB9_1;
 ; SM60-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
     ret i8 %new
@@ -471,43 +471,43 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB10_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB10_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB10_1;
 ; SM60-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
@@ -518,43 +518,43 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB11_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB11_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB11_1;
 ; SM60-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
     ret i8 %new
@@ -565,43 +565,43 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB12_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB12_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB12_1;
 ; SM60-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
     ret i8 %new
@@ -612,43 +612,43 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB13_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB13_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB13_1;
 ; SM60-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
     ret i8 %new
@@ -659,43 +659,43 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB14_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB14_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB14_1;
 ; SM60-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
     ret i8 %new
@@ -706,40 +706,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp,
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB15_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB15_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB15_1;
 ; SM60-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
     ret i16 %new
@@ -750,41 +750,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB16_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB16_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB16_1;
 ; SM60-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
     ret i16 %new
@@ -795,42 +795,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB17_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB17_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB17_1;
 ; SM60-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
     ret i16 %new
@@ -841,41 +841,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB18_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB18_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB18_1;
 ; SM60-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
     ret i16 %new
@@ -886,41 +886,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1];
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB19_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB19_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB19_1;
 ; SM60-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
     ret i16 %new
@@ -931,42 +931,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB20_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB20_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB20_1;
 ; SM60-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
     ret i16 %new
@@ -977,41 +977,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB21_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB21_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB21_1;
 ; SM60-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
     ret i16 %new
@@ -1022,42 +1022,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB22_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB22_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB22_1;
 ; SM60-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
     ret i16 %new
@@ -1068,42 +1068,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB23_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB23_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB23_1;
 ; SM60-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
     ret i16 %new
@@ -1114,42 +1114,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB24_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB24_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB24_1;
 ; SM60-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
     ret i16 %new
@@ -1160,42 +1160,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB25_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB25_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB25_1;
 ; SM60-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
     ret i16 %new
@@ -1206,42 +1206,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB26_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB26_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB26_1;
 ; SM60-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
     ret i16 %new
@@ -1252,42 +1252,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB27_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB27_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB27_1;
 ; SM60-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
     ret i16 %new
@@ -1298,42 +1298,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB28_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB28_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB28_1;
 ; SM60-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
     ret i16 %new
@@ -1344,42 +1344,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT:    ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 65535;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB29_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB29_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB29_1;
 ; SM60-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
     ret i16 %new
@@ -1899,43 +1899,43 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.sys.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB60_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB60_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB60_1;
 ; SM60-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1997,43 +1997,43 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB64_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB64_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB64_1;
 ; SM60-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
@@ -2044,43 +2044,43 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b32 %r<18>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM60-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 255;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    and.b32 %r15, %r14, 255;
-; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
-; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM60-NEXT:    and.b32 %r9, %r8, 3;
+; SM60-NEXT:    shl.b32 %r1, %r9, 3;
+; SM60-NEXT:    mov.b32 %r10, 255;
+; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM60-NEXT:    not.b32 %r2, %r11;
+; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM60-NEXT:    and.b32 %r13, %r12, 255;
+; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM60-NEXT:    ld.shared.b32 %r14, [%rd1];
+; SM60-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM60-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r17, %r20, %r3;
-; SM60-NEXT:    or.b32 %r18, %r20, %r4;
-; SM60-NEXT:    atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT:    or.b32 %r15, %r17, %r3;
+; SM60-NEXT:    or.b32 %r16, %r17, %r4;
+; SM60-NEXT:    atom.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM60-NEXT:    @%p1 bra $L__BB65_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB65_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM60-NEXT:    mov.b32 %r20, %r8;
+; SM60-NEXT:    and.b32 %r6, %r5, %r2;
+; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM60-NEXT:    mov.b32 %r17, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB65_1;
 ; SM60-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.cta;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
index 5cb344d5ded84..065b89c7ebf74 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -7,41 +7,41 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB0_1;
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
@@ -52,42 +52,42 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB1_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB1_1;
 ; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
@@ -98,43 +98,43 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB2_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB2_1;
 ; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
@@ -145,42 +145,42 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB3_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB3_1;
 ; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
@@ -191,42 +191,42 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB4_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB4_1;
 ; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
@@ -237,43 +237,43 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB5_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB5_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB5_1;
 ; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
     ret i8 %new
@@ -284,42 +284,42 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB6_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB6_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB6_1;
 ; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
     ret i8 %new
@@ -330,43 +330,43 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB7_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB7_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB7_1;
 ; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
     ret i8 %new
@@ -377,43 +377,43 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB8_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB8_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB8_1;
 ; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
     ret i8 %new
@@ -424,43 +424,43 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB9_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB9_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB9_1;
 ; SM70-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
     ret i8 %new
@@ -471,43 +471,43 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB10_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB10_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB10_1;
 ; SM70-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
@@ -518,43 +518,43 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB11_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB11_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB11_1;
 ; SM70-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
     ret i8 %new
@@ -565,43 +565,43 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB12_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB12_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB12_1;
 ; SM70-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
     ret i8 %new
@@ -612,43 +612,43 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB13_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB13_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB13_1;
 ; SM70-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
     ret i8 %new
@@ -659,43 +659,43 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB14_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB14_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB14_1;
 ; SM70-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
     ret i8 %new
@@ -706,40 +706,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp,
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB15_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB15_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB15_1;
 ; SM70-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
     ret i16 %new
@@ -750,41 +750,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB16_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB16_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB16_1;
 ; SM70-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
     ret i16 %new
@@ -795,42 +795,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB17_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB17_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB17_1;
 ; SM70-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
     ret i16 %new
@@ -841,41 +841,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB18_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB18_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB18_1;
 ; SM70-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
     ret i16 %new
@@ -886,41 +886,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB19_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB19_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB19_1;
 ; SM70-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
     ret i16 %new
@@ -931,42 +931,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB20_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB20_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB20_1;
 ; SM70-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
     ret i16 %new
@@ -977,41 +977,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB21_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB21_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB21_1;
 ; SM70-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
     ret i16 %new
@@ -1022,42 +1022,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB22_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB22_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB22_1;
 ; SM70-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
     ret i16 %new
@@ -1068,42 +1068,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB23_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB23_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB23_1;
 ; SM70-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
     ret i16 %new
@@ -1114,42 +1114,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB24_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB24_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB24_1;
 ; SM70-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
     ret i16 %new
@@ -1160,42 +1160,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB25_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB25_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB25_1;
 ; SM70-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
     ret i16 %new
@@ -1206,42 +1206,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB26_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB26_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB26_1;
 ; SM70-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
     ret i16 %new
@@ -1252,42 +1252,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB27_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB27_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB27_1;
 ; SM70-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
     ret i16 %new
@@ -1298,42 +1298,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB28_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB28_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB28_1;
 ; SM70-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
     ret i16 %new
@@ -1344,42 +1344,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
 ; SM70-NEXT:    fence.sc.cta;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB29_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB29_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB29_1;
 ; SM70-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
     ret i16 %new
@@ -1899,43 +1899,43 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB60_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB60_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB60_1;
 ; SM70-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1997,43 +1997,43 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB64_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB64_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB64_1;
 ; SM70-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
@@ -2044,43 +2044,43 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.shared.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB65_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB65_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB65_1;
 ; SM70-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.cta;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index 7cb259023d6dd..e4433570bdd70 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -7,41 +7,41 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB0_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB0_1;
 ; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
     ret i8 %new
@@ -52,42 +52,42 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB1_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB1_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB1_1;
 ; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
     ret i8 %new
@@ -98,43 +98,43 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB2_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB2_1;
 ; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
     ret i8 %new
@@ -145,42 +145,42 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB3_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB3_1;
 ; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
     ret i8 %new
@@ -191,42 +191,42 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB4_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB4_1;
 ; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
     ret i8 %new
@@ -237,43 +237,43 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB5_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB5_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB5_1;
 ; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
     ret i8 %new
@@ -284,42 +284,42 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB6_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB6_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB6_1;
 ; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
     ret i8 %new
@@ -330,43 +330,43 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB7_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB7_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB7_1;
 ; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
     ret i8 %new
@@ -377,43 +377,43 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB8_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB8_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB8_1;
 ; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
     ret i8 %new
@@ -424,43 +424,43 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB9_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB9_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB9_1;
 ; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
     ret i8 %new
@@ -471,43 +471,43 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB10_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB10_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB10_1;
 ; SM90-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
@@ -518,43 +518,43 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB11_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB11_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB11_1;
 ; SM90-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
     ret i8 %new
@@ -565,43 +565,43 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB12_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB12_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB12_1;
 ; SM90-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
     ret i8 %new
@@ -612,43 +612,43 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB13_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB13_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB13_1;
 ; SM90-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
     ret i8 %new
@@ -659,43 +659,43 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB14_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB14_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB14_1;
 ; SM90-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
     ret i8 %new
@@ -706,40 +706,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp,
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB15_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB15_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB15_1;
 ; SM90-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
     ret i16 %new
@@ -750,41 +750,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB16_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB16_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB16_1;
 ; SM90-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
     ret i16 %new
@@ -795,42 +795,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB17_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB17_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB17_1;
 ; SM90-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
     ret i16 %new
@@ -841,41 +841,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB18_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB18_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB18_1;
 ; SM90-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
     ret i16 %new
@@ -886,41 +886,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1];
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB19_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB19_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB19_1;
 ; SM90-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
     ret i16 %new
@@ -931,42 +931,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB20_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB20_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB20_1;
 ; SM90-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
     ret i16 %new
@@ -977,41 +977,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB21_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB21_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB21_1;
 ; SM90-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
     ret i16 %new
@@ -1022,42 +1022,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB22_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB22_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB22_1;
 ; SM90-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
     ret i16 %new
@@ -1068,42 +1068,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB23_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB23_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB23_1;
 ; SM90-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
     ret i16 %new
@@ -1114,42 +1114,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB24_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB24_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB24_1;
 ; SM90-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
     ret i16 %new
@@ -1160,42 +1160,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB25_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB25_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB25_1;
 ; SM90-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
     ret i16 %new
@@ -1206,42 +1206,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB26_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB26_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB26_1;
 ; SM90-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
     ret i16 %new
@@ -1252,42 +1252,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB27_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB27_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB27_1;
 ; SM90-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
     ret i16 %new
@@ -1298,42 +1298,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB28_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB28_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB28_1;
 ; SM90-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
     ret i16 %new
@@ -1344,42 +1344,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b32 %r<17>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
 ; SM90-NEXT:    fence.sc.cta;
-; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT:    ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 65535;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 65535;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM90-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r16, %r19, %r3;
-; SM90-NEXT:    or.b32 %r17, %r19, %r4;
-; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM90-NEXT:    or.b32 %r14, %r16, %r3;
+; SM90-NEXT:    or.b32 %r15, %r16, %r4;
+; SM90-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM90-NEXT:    @%p1 bra $L__BB29_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB29_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT:    mov.b32 %r19, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM90-NEXT:    mov.b32 %r16, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB29_1;
 ; SM90-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
     ret i16 %new
@@ -1899,43 +1899,43 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB60_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB60_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB60_1;
 ; SM90-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -2014,43 +2014,43 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB65_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB65_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB65_1;
 ; SM90-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
@@ -2061,43 +2061,43 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b32 %r<18>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
 ; SM90-NEXT:    fence.release.cta;
-; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM90-NEXT:    ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT:    and.b32 %r11, %r10, 3;
-; SM90-NEXT:    shl.b32 %r1, %r11, 3;
-; SM90-NEXT:    mov.b32 %r12, 255;
-; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM90-NEXT:    not.b32 %r2, %r13;
-; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT:    and.b32 %r15, %r14, 255;
-; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
-; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM90-NEXT:    and.b32 %r9, %r8, 3;
+; SM90-NEXT:    shl.b32 %r1, %r9, 3;
+; SM90-NEXT:    mov.b32 %r10, 255;
+; SM90-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM90-NEXT:    not.b32 %r2, %r11;
+; SM90-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM90-NEXT:    and.b32 %r13, %r12, 255;
+; SM90-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM90-NEXT:    ld.shared.b32 %r14, [%rd1];
+; SM90-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM90-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM90-NEXT:    or.b32 %r17, %r20, %r3;
-; SM90-NEXT:    or.b32 %r18, %r20, %r4;
-; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT:    or.b32 %r15, %r17, %r3;
+; SM90-NEXT:    or.b32 %r16, %r17, %r4;
+; SM90-NEXT:    atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM90-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM90-NEXT:    @%p1 bra $L__BB66_3;
 ; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT:    // in Loop: Header=BB66_1 Depth=1
-; SM90-NEXT:    and.b32 %r8, %r7, %r2;
-; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT:    mov.b32 %r20, %r8;
+; SM90-NEXT:    and.b32 %r6, %r5, %r2;
+; SM90-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM90-NEXT:    mov.b32 %r17, %r6;
 ; SM90-NEXT:    @%p2 bra $L__BB66_1;
 ; SM90-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.cta;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
     ret i8 %new
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index 237e42394ba2f..997df7a8ad8b8 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -14,82 +14,82 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b32 %r<18>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.b8 %r9, [relaxed_sys_i8_param_1];
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 255;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    and.b32 %r15, %r14, 255;
-; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r16, [%rd1];
-; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:    ld.param.b8 %r7, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 255;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    and.b32 %r13, %r12, 255;
+; SM30-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r14, [%rd1];
+; SM30-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r17, %r20, %r3;
-; SM30-NEXT:    or.b32 %r18, %r20, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM30-NEXT:    or.b32 %r15, %r17, %r3;
+; SM30-NEXT:    or.b32 %r16, %r17, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM30-NEXT:    @%p1 bra $L__BB0_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM30-NEXT:    mov.b32 %r20, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM30-NEXT:    mov.b32 %r17, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB0_1;
 ; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: relaxed_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r9, [relaxed_sys_i8_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    ld.param.b8 %r7, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB0_1;
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: relaxed_sys_i8(
 ; SM90:       {
@@ -140,84 +140,84 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b32 %r<18>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.b8 %r9, [acquire_sys_i8_param_1];
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 255;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    and.b32 %r15, %r14, 255;
-; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r16, [%rd1];
-; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:    ld.param.b8 %r7, [acquire_sys_i8_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 255;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    and.b32 %r13, %r12, 255;
+; SM30-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r14, [%rd1];
+; SM30-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM30-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r17, %r20, %r3;
-; SM30-NEXT:    or.b32 %r18, %r20, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM30-NEXT:    or.b32 %r15, %r17, %r3;
+; SM30-NEXT:    or.b32 %r16, %r17, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM30-NEXT:    @%p1 bra $L__BB1_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB1_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM30-NEXT:    mov.b32 %r20, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM30-NEXT:    mov.b32 %r17, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB1_1;
 ; SM30-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: acquire_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b8 %r9, [acquire_sys_i8_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    ld.param.b8 %r7, [acquire_sys_i8_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB1_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB1_1;
 ; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acquire_sys_i8(
 ; SM90:       {
@@ -269,84 +269,84 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b32 %r<18>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.b8 %r9, [release_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r7, [release_sys_i8_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 255;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    and.b32 %r15, %r14, 255;
-; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r16, [%rd1];
-; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 255;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    and.b32 %r13, %r12, 255;
+; SM30-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r14, [%rd1];
+; SM30-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM30-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r17, %r20, %r3;
-; SM30-NEXT:    or.b32 %r18, %r20, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM30-NEXT:    or.b32 %r15, %r17, %r3;
+; SM30-NEXT:    or.b32 %r16, %r17, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM30-NEXT:    @%p1 bra $L__BB2_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM30-NEXT:    mov.b32 %r20, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM30-NEXT:    mov.b32 %r17, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB2_1;
 ; SM30-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: release_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b8 %r9, [release_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [release_sys_i8_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB2_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB2_1;
 ; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: release_sys_i8(
 ; SM90:       {
@@ -398,86 +398,86 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b32 %r<18>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.b8 %r9, [acq_rel_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r7, [acq_rel_sys_i8_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 255;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    and.b32 %r15, %r14, 255;
-; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r16, [%rd1];
-; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 255;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    and.b32 %r13, %r12, 255;
+; SM30-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r14, [%rd1];
+; SM30-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM30-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r17, %r20, %r3;
-; SM30-NEXT:    or.b32 %r18, %r20, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM30-NEXT:    or.b32 %r15, %r17, %r3;
+; SM30-NEXT:    or.b32 %r16, %r17, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM30-NEXT:    @%p1 bra $L__BB3_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM30-NEXT:    mov.b32 %r20, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM30-NEXT:    mov.b32 %r17, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB3_1;
 ; SM30-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: acq_rel_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [acq_rel_sys_i8_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB3_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB3_1;
 ; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acq_rel_sys_i8(
 ; SM90:       {
@@ -530,86 +530,86 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b32 %r<18>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.b8 %r9, [seq_cst_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r7, [seq_cst_sys_i8_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 255;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    and.b32 %r15, %r14, 255;
-; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r16, [%rd1];
-; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 255;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    and.b32 %r13, %r12, 255;
+; SM30-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r14, [%rd1];
+; SM30-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM30-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r17, %r20, %r3;
-; SM30-NEXT:    or.b32 %r18, %r20, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM30-NEXT:    or.b32 %r15, %r17, %r3;
+; SM30-NEXT:    or.b32 %r16, %r17, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM30-NEXT:    @%p1 bra $L__BB4_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM30-NEXT:    mov.b32 %r20, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM30-NEXT:    mov.b32 %r17, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB4_1;
 ; SM30-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: seq_cst_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<18>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r7, [seq_cst_sys_i8_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 255;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    and.b32 %r15, %r14, 255;
-; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 255;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    and.b32 %r13, %r12, 255;
+; SM70-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r14, [%rd1];
+; SM70-NEXT:    and.b32 %r17, %r14, %r2;
 ; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT:    or.b32 %r15, %r17, %r3;
+; SM70-NEXT:    or.b32 %r16, %r17, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r16;
 ; SM70-NEXT:    @%p1 bra $L__BB4_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.b32 %r20, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r17, %r6;
+; SM70-NEXT:    mov.b32 %r17, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB4_1;
 ; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: seq_cst_sys_i8(
 ; SM90:       {
@@ -663,80 +663,80 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b32 %r<17>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.b16 %r9, [relaxed_sys_i16_param_1];
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 65535;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r15, [%rd1];
-; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:    ld.param.b16 %r7, [relaxed_sys_i16_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 65535;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r13, [%rd1];
+; SM30-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM30-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r16, %r19, %r3;
-; SM30-NEXT:    or.b32 %r17, %r19, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM30-NEXT:    or.b32 %r14, %r16, %r3;
+; SM30-NEXT:    or.b32 %r15, %r16, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM30-NEXT:    @%p1 bra $L__BB5_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB5_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM30-NEXT:    mov.b32 %r19, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM30-NEXT:    mov.b32 %r16, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB5_1;
 ; SM30-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: relaxed_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [relaxed_sys_i16_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b16 %r7, [relaxed_sys_i16_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB5_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB5_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB5_1;
 ; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: relaxed_sys_i16(
 ; SM90:       {
@@ -786,82 +786,82 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b32 %r<17>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b16 %rs1, [acquire_sys_i16_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [acquire_sys_i16_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.b16 %r9, [acquire_sys_i16_param_1];
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 65535;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r15, [%rd1];
-; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:    ld.param.b16 %r7, [acquire_sys_i16_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 65535;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r13, [%rd1];
+; SM30-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM30-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r16, %r19, %r3;
-; SM30-NEXT:    or.b32 %r17, %r19, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM30-NEXT:    or.b32 %r14, %r16, %r3;
+; SM30-NEXT:    or.b32 %r15, %r16, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM30-NEXT:    @%p1 bra $L__BB6_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB6_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM30-NEXT:    mov.b32 %r19, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM30-NEXT:    mov.b32 %r16, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB6_1;
 ; SM30-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: acquire_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acquire_sys_i16_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i16_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.b16 %r9, [acquire_sys_i16_param_1];
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    ld.param.b16 %r7, [acquire_sys_i16_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB6_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB6_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB6_1;
 ; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acquire_sys_i16(
 ; SM90:       {
@@ -912,82 +912,82 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b32 %r<17>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b16 %rs1, [release_sys_i16_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [release_sys_i16_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.b16 %r9, [release_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r7, [release_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 65535;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r15, [%rd1];
-; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 65535;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r13, [%rd1];
+; SM30-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM30-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r16, %r19, %r3;
-; SM30-NEXT:    or.b32 %r17, %r19, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM30-NEXT:    or.b32 %r14, %r16, %r3;
+; SM30-NEXT:    or.b32 %r15, %r16, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM30-NEXT:    @%p1 bra $L__BB7_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB7_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM30-NEXT:    mov.b32 %r19, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM30-NEXT:    mov.b32 %r16, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB7_1;
 ; SM30-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: release_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [release_sys_i16_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i16_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [release_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [release_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB7_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB7_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB7_1;
 ; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: release_sys_i16(
 ; SM90:       {
@@ -1038,84 +1038,84 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b32 %r<17>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r7, [acq_rel_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 65535;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r15, [%rd1];
-; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 65535;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r13, [%rd1];
+; SM30-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM30-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r16, %r19, %r3;
-; SM30-NEXT:    or.b32 %r17, %r19, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM30-NEXT:    or.b32 %r14, %r16, %r3;
+; SM30-NEXT:    or.b32 %r15, %r16, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM30-NEXT:    @%p1 bra $L__BB8_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB8_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM30-NEXT:    mov.b32 %r19, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM30-NEXT:    mov.b32 %r16, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB8_1;
 ; SM30-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: acq_rel_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [acq_rel_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB8_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB8_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB8_1;
 ; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acq_rel_sys_i16(
 ; SM90:       {
@@ -1168,84 +1168,84 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
-; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b32 %r<17>;
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
 ; SM30-NEXT:    ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r7, [seq_cst_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM30-NEXT:    and.b32 %r11, %r10, 3;
-; SM30-NEXT:    shl.b32 %r1, %r11, 3;
-; SM30-NEXT:    mov.b32 %r12, 65535;
-; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM30-NEXT:    not.b32 %r2, %r13;
-; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.b32 %r15, [%rd1];
-; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM30-NEXT:    and.b32 %r9, %r8, 3;
+; SM30-NEXT:    shl.b32 %r1, %r9, 3;
+; SM30-NEXT:    mov.b32 %r10, 65535;
+; SM30-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM30-NEXT:    not.b32 %r2, %r11;
+; SM30-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM30-NEXT:    ld.b32 %r13, [%rd1];
+; SM30-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM30-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r16, %r19, %r3;
-; SM30-NEXT:    or.b32 %r17, %r19, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM30-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM30-NEXT:    or.b32 %r14, %r16, %r3;
+; SM30-NEXT:    or.b32 %r15, %r16, %r4;
+; SM30-NEXT:    atom.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM30-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM30-NEXT:    @%p1 bra $L__BB9_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM30-NEXT:    // in Loop: Header=BB9_1 Depth=1
-; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM30-NEXT:    mov.b32 %r19, %r8;
+; SM30-NEXT:    and.b32 %r6, %r5, %r2;
+; SM30-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM30-NEXT:    mov.b32 %r16, %r6;
 ; SM30-NEXT:    @%p2 bra $L__BB9_1;
 ; SM30-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: seq_cst_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r7, [seq_cst_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:    cvt.u32.u64 %r8, %rd2;
+; SM70-NEXT:    and.b32 %r9, %r8, 3;
+; SM70-NEXT:    shl.b32 %r1, %r9, 3;
+; SM70-NEXT:    mov.b32 %r10, 65535;
+; SM70-NEXT:    shl.b32 %r11, %r10, %r1;
+; SM70-NEXT:    not.b32 %r2, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r12, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r7, %r1;
+; SM70-NEXT:    ld.b32 %r13, [%rd1];
+; SM70-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
+; SM70-NEXT:    or.b32 %r14, %r16, %r3;
+; SM70-NEXT:    or.b32 %r15, %r16, %r4;
+; SM70-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM70-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM70-NEXT:    @%p1 bra $L__BB9_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB9_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
+; SM70-NEXT:    and.b32 %r6, %r5, %r2;
+; SM70-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM70-NEXT:    mov.b32 %r16, %r6;
 ; SM70-NEXT:    @%p2 bra $L__BB9_1;
 ; SM70-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r12;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: seq_cst_sys_i16(
 ; SM90:       {
diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll
index da303b7c38eb7..e6bce8991a71d 100644
--- a/llvm/test/CodeGen/NVPTX/combine-mad.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll
@@ -189,7 +189,7 @@ declare i32 @use(i32 %0, i32 %1)
 define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test_mad_multi_use(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_mad_multi_use_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
index dd3e4ecddcd2e..6c80055ef4673 100644
--- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
@@ -9,7 +9,7 @@ declare i64 @callee_variadic(ptr %p, ...);
 define %struct.64 @test_return_type_mismatch(ptr %p) {
 ; CHECK-LABEL: test_return_type_mismatch(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<40>;
+; CHECK-NEXT:    .reg .b64 %rd<32>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_return_type_mismatch_param_0];
@@ -29,35 +29,35 @@ define %struct.64 @test_return_type_mismatch(ptr %p) {
 ; CHECK-NEXT:    ld.param.b8 %rd9, [retval0+1];
 ; CHECK-NEXT:    ld.param.b8 %rd10, [retval0];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    shl.b64 %rd13, %rd9, 8;
-; CHECK-NEXT:    or.b64 %rd14, %rd13, %rd10;
-; CHECK-NEXT:    shl.b64 %rd16, %rd8, 16;
-; CHECK-NEXT:    shl.b64 %rd18, %rd7, 24;
-; CHECK-NEXT:    or.b64 %rd19, %rd18, %rd16;
-; CHECK-NEXT:    or.b64 %rd20, %rd19, %rd14;
-; CHECK-NEXT:    shl.b64 %rd23, %rd5, 8;
-; CHECK-NEXT:    or.b64 %rd24, %rd23, %rd6;
-; CHECK-NEXT:    shl.b64 %rd26, %rd4, 16;
-; CHECK-NEXT:    shl.b64 %rd28, %rd3, 24;
-; CHECK-NEXT:    or.b64 %rd29, %rd28, %rd26;
-; CHECK-NEXT:    or.b64 %rd30, %rd29, %rd24;
-; CHECK-NEXT:    shl.b64 %rd31, %rd30, 32;
-; CHECK-NEXT:    or.b64 %rd32, %rd31, %rd20;
+; CHECK-NEXT:    shl.b64 %rd11, %rd9, 8;
+; CHECK-NEXT:    or.b64 %rd12, %rd11, %rd10;
+; CHECK-NEXT:    shl.b64 %rd13, %rd8, 16;
+; CHECK-NEXT:    shl.b64 %rd14, %rd7, 24;
+; CHECK-NEXT:    or.b64 %rd15, %rd14, %rd13;
+; CHECK-NEXT:    or.b64 %rd16, %rd15, %rd12;
+; CHECK-NEXT:    shl.b64 %rd17, %rd5, 8;
+; CHECK-NEXT:    or.b64 %rd18, %rd17, %rd6;
+; CHECK-NEXT:    shl.b64 %rd19, %rd4, 16;
+; CHECK-NEXT:    shl.b64 %rd20, %rd3, 24;
+; CHECK-NEXT:    or.b64 %rd21, %rd20, %rd19;
+; CHECK-NEXT:    or.b64 %rd22, %rd21, %rd18;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-NEXT:    or.b64 %rd24, %rd23, %rd16;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rd10;
-; CHECK-NEXT:    shr.u64 %rd33, %rd32, 56;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %rd33;
-; CHECK-NEXT:    shr.u64 %rd34, %rd32, 48;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %rd34;
-; CHECK-NEXT:    shr.u64 %rd35, %rd32, 40;
-; CHECK-NEXT:    st.param.b8 [func_retval0+5], %rd35;
-; CHECK-NEXT:    shr.u64 %rd36, %rd32, 32;
-; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rd36;
-; CHECK-NEXT:    shr.u64 %rd37, %rd32, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+3], %rd37;
-; CHECK-NEXT:    shr.u64 %rd38, %rd32, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+2], %rd38;
-; CHECK-NEXT:    shr.u64 %rd39, %rd32, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+1], %rd39;
+; CHECK-NEXT:    shr.u64 %rd25, %rd24, 56;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %rd25;
+; CHECK-NEXT:    shr.u64 %rd26, %rd24, 48;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %rd26;
+; CHECK-NEXT:    shr.u64 %rd27, %rd24, 40;
+; CHECK-NEXT:    st.param.b8 [func_retval0+5], %rd27;
+; CHECK-NEXT:    shr.u64 %rd28, %rd24, 32;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rd28;
+; CHECK-NEXT:    shr.u64 %rd29, %rd24, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+3], %rd29;
+; CHECK-NEXT:    shr.u64 %rd30, %rd24, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+2], %rd30;
+; CHECK-NEXT:    shr.u64 %rd31, %rd24, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+1], %rd31;
 ; CHECK-NEXT:    ret;
   %ret = call %struct.64 @callee(ptr %p)
   ret %struct.64 %ret
@@ -66,7 +66,7 @@ define %struct.64 @test_return_type_mismatch(ptr %p) {
 define i64 @test_param_type_mismatch(ptr %p) {
 ; CHECK-LABEL: test_param_type_mismatch(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 1, 0
@@ -87,7 +87,7 @@ define i64 @test_param_type_mismatch(ptr %p) {
 define i64 @test_param_count_mismatch(ptr %p) {
 ; CHECK-LABEL: test_param_count_mismatch(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_param_count_mismatch_param_0];
@@ -111,7 +111,7 @@ define i64 @test_param_count_mismatch(ptr %p) {
 define %struct.64 @test_return_type_mismatch_variadic(ptr %p) {
 ; CHECK-LABEL: test_return_type_mismatch_variadic(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<40>;
+; CHECK-NEXT:    .reg .b64 %rd<32>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_return_type_mismatch_variadic_param_0];
@@ -131,35 +131,35 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) {
 ; CHECK-NEXT:    ld.param.b8 %rd9, [retval0+1];
 ; CHECK-NEXT:    ld.param.b8 %rd10, [retval0];
 ; CHECK-NEXT:    } // callseq 3
-; CHECK-NEXT:    shl.b64 %rd13, %rd9, 8;
-; CHECK-NEXT:    or.b64 %rd14, %rd13, %rd10;
-; CHECK-NEXT:    shl.b64 %rd16, %rd8, 16;
-; CHECK-NEXT:    shl.b64 %rd18, %rd7, 24;
-; CHECK-NEXT:    or.b64 %rd19, %rd18, %rd16;
-; CHECK-NEXT:    or.b64 %rd20, %rd19, %rd14;
-; CHECK-NEXT:    shl.b64 %rd23, %rd5, 8;
-; CHECK-NEXT:    or.b64 %rd24, %rd23, %rd6;
-; CHECK-NEXT:    shl.b64 %rd26, %rd4, 16;
-; CHECK-NEXT:    shl.b64 %rd28, %rd3, 24;
-; CHECK-NEXT:    or.b64 %rd29, %rd28, %rd26;
-; CHECK-NEXT:    or.b64 %rd30, %rd29, %rd24;
-; CHECK-NEXT:    shl.b64 %rd31, %rd30, 32;
-; CHECK-NEXT:    or.b64 %rd32, %rd31, %rd20;
+; CHECK-NEXT:    shl.b64 %rd11, %rd9, 8;
+; CHECK-NEXT:    or.b64 %rd12, %rd11, %rd10;
+; CHECK-NEXT:    shl.b64 %rd13, %rd8, 16;
+; CHECK-NEXT:    shl.b64 %rd14, %rd7, 24;
+; CHECK-NEXT:    or.b64 %rd15, %rd14, %rd13;
+; CHECK-NEXT:    or.b64 %rd16, %rd15, %rd12;
+; CHECK-NEXT:    shl.b64 %rd17, %rd5, 8;
+; CHECK-NEXT:    or.b64 %rd18, %rd17, %rd6;
+; CHECK-NEXT:    shl.b64 %rd19, %rd4, 16;
+; CHECK-NEXT:    shl.b64 %rd20, %rd3, 24;
+; CHECK-NEXT:    or.b64 %rd21, %rd20, %rd19;
+; CHECK-NEXT:    or.b64 %rd22, %rd21, %rd18;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-NEXT:    or.b64 %rd24, %rd23, %rd16;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rd10;
-; CHECK-NEXT:    shr.u64 %rd33, %rd32, 56;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %rd33;
-; CHECK-NEXT:    shr.u64 %rd34, %rd32, 48;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %rd34;
-; CHECK-NEXT:    shr.u64 %rd35, %rd32, 40;
-; CHECK-NEXT:    st.param.b8 [func_retval0+5], %rd35;
-; CHECK-NEXT:    shr.u64 %rd36, %rd32, 32;
-; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rd36;
-; CHECK-NEXT:    shr.u64 %rd37, %rd32, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+3], %rd37;
-; CHECK-NEXT:    shr.u64 %rd38, %rd32, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+2], %rd38;
-; CHECK-NEXT:    shr.u64 %rd39, %rd32, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+1], %rd39;
+; CHECK-NEXT:    shr.u64 %rd25, %rd24, 56;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %rd25;
+; CHECK-NEXT:    shr.u64 %rd26, %rd24, 48;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %rd26;
+; CHECK-NEXT:    shr.u64 %rd27, %rd24, 40;
+; CHECK-NEXT:    st.param.b8 [func_retval0+5], %rd27;
+; CHECK-NEXT:    shr.u64 %rd28, %rd24, 32;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rd28;
+; CHECK-NEXT:    shr.u64 %rd29, %rd24, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+3], %rd29;
+; CHECK-NEXT:    shr.u64 %rd30, %rd24, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+2], %rd30;
+; CHECK-NEXT:    shr.u64 %rd31, %rd24, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+1], %rd31;
 ; CHECK-NEXT:    ret;
   %ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p)
   ret %struct.64 %ret
@@ -168,7 +168,7 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) {
 define i64 @test_param_type_mismatch_variadic(ptr %p) {
 ; CHECK-LABEL: test_param_type_mismatch_variadic(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_param_type_mismatch_variadic_param_0];
@@ -190,7 +190,7 @@ define i64 @test_param_type_mismatch_variadic(ptr %p) {
 define i64 @test_param_count_mismatch_variadic(ptr %p) {
 ; CHECK-LABEL: test_param_count_mismatch_variadic(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_param_count_mismatch_variadic_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
index 31ecce5a66b64..2e68208786d24 100644
--- a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
+++ b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
@@ -13,33 +13,33 @@ define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<4>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<8>;
-; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b8 %rs1, [test_mov_sym_param_2];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b32 %r4, [test_mov_sym_param_0];
-; CHECK-NEXT:    cvt.s64.s32 %rd1, %r4;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_mov_sym_param_0];
+; CHECK-NEXT:    cvt.s64.s32 %rd1, %r1;
 ; CHECK-NEXT:    mov.b64 %rd2, global_smem;
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, %rd1;
-; CHECK-NEXT:    ld.shared.b32 %r7, [%rd3];
+; CHECK-NEXT:    ld.shared.b32 %r4, [%rd3];
 ; CHECK-NEXT:    not.pred %p2, %p1;
 ; CHECK-NEXT:    @%p2 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %if1.preheader
-; CHECK-NEXT:    ld.param.b32 %r5, [test_mov_sym_param_1];
-; CHECK-NEXT:    setp.ne.b32 %p3, %r4, %r5;
+; CHECK-NEXT:    ld.param.b32 %r2, [test_mov_sym_param_1];
+; CHECK-NEXT:    setp.ne.b32 %p3, %r1, %r2;
 ; CHECK-NEXT:  $L__BB0_2: // %if1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    @%p3 bra $L__BB0_2;
 ; CHECK-NEXT:  // %bb.3: // %if2
-; CHECK-NEXT:    cvt.s64.s32 %rd4, %r5;
-; CHECK-NEXT:    add.s64 %rd6, %rd2, %rd4;
-; CHECK-NEXT:    ld.shared.b32 %r6, [%rd6];
-; CHECK-NEXT:    add.s32 %r7, %r7, %r6;
+; CHECK-NEXT:    cvt.s64.s32 %rd4, %r2;
+; CHECK-NEXT:    add.s64 %rd5, %rd2, %rd4;
+; CHECK-NEXT:    ld.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.s32 %r4, %r4, %r3;
 ; CHECK-NEXT:  $L__BB0_4: // %end
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
 entry:
     %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
index 2841e6751d029..1d70b9deb6089 100644
--- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
+++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
@@ -166,23 +166,23 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<11>;
-; CHECK-NEXT:    .reg .b32 %r<53>;
+; CHECK-NEXT:    .reg .b32 %r<43>;
 ; CHECK-NEXT:    .reg .b64 %rd<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
-; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0;
-; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r15, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r16, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.release.sys.shared::cluster.cas.b32 %r17, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r18, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r19, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r20, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r21, [%rd2], 1, 0;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0;
+; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r22, [%rd2], 1, 0;
 ; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0;
 ; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0;
 ; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0;
@@ -196,92 +196,92 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:    fence.sc.sys;
 ; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0;
 ; CHECK-NEXT:    and.b64 %rd1, %rd2, -4;
-; CHECK-NEXT:    cvt.u32.u64 %r33, %rd2;
-; CHECK-NEXT:    and.b32 %r34, %r33, 3;
-; CHECK-NEXT:    shl.b32 %r1, %r34, 3;
-; CHECK-NEXT:    mov.b32 %r35, 65535;
-; CHECK-NEXT:    shl.b32 %r36, %r35, %r1;
-; CHECK-NEXT:    not.b32 %r2, %r36;
-; CHECK-NEXT:    mov.b32 %r37, 1;
-; CHECK-NEXT:    shl.b32 %r3, %r37, %r1;
-; CHECK-NEXT:    ld.shared::cluster.b32 %r38, [%rd1];
-; CHECK-NEXT:    and.b32 %r48, %r38, %r2;
+; CHECK-NEXT:    cvt.u32.u64 %r23, %rd2;
+; CHECK-NEXT:    and.b32 %r24, %r23, 3;
+; CHECK-NEXT:    shl.b32 %r1, %r24, 3;
+; CHECK-NEXT:    mov.b32 %r25, 65535;
+; CHECK-NEXT:    shl.b32 %r26, %r25, %r1;
+; CHECK-NEXT:    not.b32 %r2, %r26;
+; CHECK-NEXT:    mov.b32 %r27, 1;
+; CHECK-NEXT:    shl.b32 %r3, %r27, %r1;
+; CHECK-NEXT:    ld.shared::cluster.b32 %r28, [%rd1];
+; CHECK-NEXT:    and.b32 %r38, %r28, %r2;
 ; CHECK-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop33
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    or.b32 %r39, %r48, %r3;
-; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48;
-; CHECK-NEXT:    setp.eq.b32 %p1, %r6, %r39;
+; CHECK-NEXT:    or.b32 %r29, %r38, %r3;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r4, [%rd1], %r29, %r38;
+; CHECK-NEXT:    setp.eq.b32 %p1, %r4, %r29;
 ; CHECK-NEXT:    @%p1 bra $L__BB4_3;
 ; CHECK-NEXT:  // %bb.2: // %partword.cmpxchg.failure32
 ; CHECK-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; CHECK-NEXT:    and.b32 %r7, %r6, %r2;
-; CHECK-NEXT:    setp.ne.b32 %p2, %r48, %r7;
-; CHECK-NEXT:    mov.b32 %r48, %r7;
+; CHECK-NEXT:    and.b32 %r5, %r4, %r2;
+; CHECK-NEXT:    setp.ne.b32 %p2, %r38, %r5;
+; CHECK-NEXT:    mov.b32 %r38, %r5;
 ; CHECK-NEXT:    @%p2 bra $L__BB4_1;
 ; CHECK-NEXT:  $L__BB4_3: // %partword.cmpxchg.end31
-; CHECK-NEXT:    ld.shared::cluster.b32 %r40, [%rd1];
-; CHECK-NEXT:    and.b32 %r49, %r40, %r2;
+; CHECK-NEXT:    ld.shared::cluster.b32 %r30, [%rd1];
+; CHECK-NEXT:    and.b32 %r39, %r30, %r2;
 ; CHECK-NEXT:  $L__BB4_4: // %partword.cmpxchg.loop23
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    or.b32 %r41, %r49, %r3;
-; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49;
-; CHECK-NEXT:    setp.eq.b32 %p3, %r10, %r41;
+; CHECK-NEXT:    or.b32 %r31, %r39, %r3;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r31, %r39;
+; CHECK-NEXT:    setp.eq.b32 %p3, %r6, %r31;
 ; CHECK-NEXT:    @%p3 bra $L__BB4_6;
 ; CHECK-NEXT:  // %bb.5: // %partword.cmpxchg.failure22
 ; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT:    and.b32 %r11, %r10, %r2;
-; CHECK-NEXT:    setp.ne.b32 %p4, %r49, %r11;
-; CHECK-NEXT:    mov.b32 %r49, %r11;
+; CHECK-NEXT:    and.b32 %r7, %r6, %r2;
+; CHECK-NEXT:    setp.ne.b32 %p4, %r39, %r7;
+; CHECK-NEXT:    mov.b32 %r39, %r7;
 ; CHECK-NEXT:    @%p4 bra $L__BB4_4;
 ; CHECK-NEXT:  $L__BB4_6: // %partword.cmpxchg.end21
 ; CHECK-NEXT:    fence.acq_rel.sys;
 ; CHECK-NEXT:    fence.acq_rel.sys;
-; CHECK-NEXT:    ld.shared::cluster.b32 %r42, [%rd1];
-; CHECK-NEXT:    and.b32 %r50, %r42, %r2;
+; CHECK-NEXT:    ld.shared::cluster.b32 %r32, [%rd1];
+; CHECK-NEXT:    and.b32 %r40, %r32, %r2;
 ; CHECK-NEXT:  $L__BB4_7: // %partword.cmpxchg.loop13
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    or.b32 %r43, %r50, %r3;
-; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50;
-; CHECK-NEXT:    setp.eq.b32 %p5, %r14, %r43;
+; CHECK-NEXT:    or.b32 %r33, %r40, %r3;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r8, [%rd1], %r33, %r40;
+; CHECK-NEXT:    setp.eq.b32 %p5, %r8, %r33;
 ; CHECK-NEXT:    @%p5 bra $L__BB4_9;
 ; CHECK-NEXT:  // %bb.8: // %partword.cmpxchg.failure12
 ; CHECK-NEXT:    // in Loop: Header=BB4_7 Depth=1
-; CHECK-NEXT:    and.b32 %r15, %r14, %r2;
-; CHECK-NEXT:    setp.ne.b32 %p6, %r50, %r15;
-; CHECK-NEXT:    mov.b32 %r50, %r15;
+; CHECK-NEXT:    and.b32 %r9, %r8, %r2;
+; CHECK-NEXT:    setp.ne.b32 %p6, %r40, %r9;
+; CHECK-NEXT:    mov.b32 %r40, %r9;
 ; CHECK-NEXT:    @%p6 bra $L__BB4_7;
 ; CHECK-NEXT:  $L__BB4_9: // %partword.cmpxchg.end11
 ; CHECK-NEXT:    fence.acq_rel.sys;
-; CHECK-NEXT:    ld.shared::cluster.b32 %r44, [%rd1];
-; CHECK-NEXT:    and.b32 %r51, %r44, %r2;
+; CHECK-NEXT:    ld.shared::cluster.b32 %r34, [%rd1];
+; CHECK-NEXT:    and.b32 %r41, %r34, %r2;
 ; CHECK-NEXT:  $L__BB4_10: // %partword.cmpxchg.loop3
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    or.b32 %r45, %r51, %r3;
-; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51;
-; CHECK-NEXT:    setp.eq.b32 %p7, %r18, %r45;
+; CHECK-NEXT:    or.b32 %r35, %r41, %r3;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r35, %r41;
+; CHECK-NEXT:    setp.eq.b32 %p7, %r10, %r35;
 ; CHECK-NEXT:    @%p7 bra $L__BB4_12;
 ; CHECK-NEXT:  // %bb.11: // %partword.cmpxchg.failure2
 ; CHECK-NEXT:    // in Loop: Header=BB4_10 Depth=1
-; CHECK-NEXT:    and.b32 %r19, %r18, %r2;
-; CHECK-NEXT:    setp.ne.b32 %p8, %r51, %r19;
-; CHECK-NEXT:    mov.b32 %r51, %r19;
+; CHECK-NEXT:    and.b32 %r11, %r10, %r2;
+; CHECK-NEXT:    setp.ne.b32 %p8, %r41, %r11;
+; CHECK-NEXT:    mov.b32 %r41, %r11;
 ; CHECK-NEXT:    @%p8 bra $L__BB4_10;
 ; CHECK-NEXT:  $L__BB4_12: // %partword.cmpxchg.end1
 ; CHECK-NEXT:    fence.acq_rel.sys;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    ld.shared::cluster.b32 %r46, [%rd1];
-; CHECK-NEXT:    and.b32 %r52, %r46, %r2;
+; CHECK-NEXT:    ld.shared::cluster.b32 %r36, [%rd1];
+; CHECK-NEXT:    and.b32 %r42, %r36, %r2;
 ; CHECK-NEXT:  $L__BB4_13: // %partword.cmpxchg.loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    or.b32 %r47, %r52, %r3;
-; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52;
-; CHECK-NEXT:    setp.eq.b32 %p9, %r22, %r47;
+; CHECK-NEXT:    or.b32 %r37, %r42, %r3;
+; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r12, [%rd1], %r37, %r42;
+; CHECK-NEXT:    setp.eq.b32 %p9, %r12, %r37;
 ; CHECK-NEXT:    @%p9 bra $L__BB4_15;
 ; CHECK-NEXT:  // %bb.14: // %partword.cmpxchg.failure
 ; CHECK-NEXT:    // in Loop: Header=BB4_13 Depth=1
-; CHECK-NEXT:    and.b32 %r23, %r22, %r2;
-; CHECK-NEXT:    setp.ne.b32 %p10, %r52, %r23;
-; CHECK-NEXT:    mov.b32 %r52, %r23;
+; CHECK-NEXT:    and.b32 %r13, %r12, %r2;
+; CHECK-NEXT:    setp.ne.b32 %p10, %r42, %r13;
+; CHECK-NEXT:    mov.b32 %r42, %r13;
 ; CHECK-NEXT:    @%p10 bra $L__BB4_13;
 ; CHECK-NEXT:  $L__BB4_15: // %partword.cmpxchg.end
 ; CHECK-NEXT:    fence.acq_rel.sys;
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 06fb8d2c7c54d..ce2f0f32a8748 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -12,7 +12,7 @@
 define i32 @test_dynamic_stackalloc(i64 %n) {
 ; CHECK-32-LABEL: test_dynamic_stackalloc(
 ; CHECK-32:       {
-; CHECK-32-NEXT:    .reg .b32 %r<8>;
+; CHECK-32-NEXT:    .reg .b32 %r<7>;
 ; CHECK-32-EMPTY:
 ; CHECK-32-NEXT:  // %bb.0:
 ; CHECK-32-NEXT:    ld.param.b32 %r1, [test_dynamic_stackalloc_param_0];
@@ -32,7 +32,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) {
 ;
 ; CHECK-64-LABEL: test_dynamic_stackalloc(
 ; CHECK-64:       {
-; CHECK-64-NEXT:    .reg .b32 %r<3>;
+; CHECK-64-NEXT:    .reg .b32 %r<2>;
 ; CHECK-64-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-64-EMPTY:
 ; CHECK-64-NEXT:  // %bb.0:
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index d61a63ce24f89..6d67ed0b4d539 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -31,16 +31,16 @@ define i1  @test_v2i8_load(ptr %a) {
 ; CHECK-LABEL: test_v2i8_load(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_v2i8_load_param_0];
 ; CHECK-NEXT:    ld.v2.b8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT:    or.b16 %rs5, %rs1, %rs2;
-; CHECK-NEXT:    and.b16 %rs6, %rs5, 255;
-; CHECK-NEXT:    setp.eq.b16 %p1, %rs6, 0;
+; CHECK-NEXT:    or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 255;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs4, 0;
 ; CHECK-NEXT:    selp.b32 %r1, -1, 0, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 991311f9492b9..64c7792a61c8c 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -455,7 +455,7 @@ declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
 define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: test_call(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_call_param_1];
@@ -478,7 +478,7 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
 define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: test_call_flipped(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_call_flipped_param_1];
@@ -501,7 +501,7 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
 define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: test_tailcall_flipped(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_tailcall_flipped_param_1];
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 467459759c42c..bcaefa1699d8b 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -863,7 +863,7 @@ declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
 define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-LABEL: test_call(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_call_param_1];
@@ -886,7 +886,7 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
 define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-LABEL: test_call_flipped(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_call_flipped_param_1];
@@ -909,7 +909,7 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
 define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-LABEL: test_tailcall_flipped(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 87274aa759bea..ba4bb76d25113 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -25,7 +25,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
 define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: t2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [t2_f32_param_0];
@@ -72,7 +72,7 @@ define ptx_device double @t1_f64(double %x, double %y, double %z) {
 define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
 ; CHECK-LABEL: t2_f64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [t2_f64_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index 636e12bf98943..4f1454d3788a4 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -7,7 +7,6 @@ define i32 @test_ld_param_const(ptr byval(i32) %a) {
 ; CHECK-LABEL: test_ld_param_const(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_ld_param_const_param_0+4];
@@ -61,7 +60,6 @@ define void @test_ld_param_byval(ptr byval(i32) %a) {
 ; CHECK-LABEL: test_ld_param_byval(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 1, 0
@@ -98,8 +96,7 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [test_multi_block_param_1];
@@ -108,12 +105,12 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) {
 ; CHECK-NEXT:    not.pred %p2, %p1;
 ; CHECK-NEXT:    @%p2 bra $L__BB5_2;
 ; CHECK-NEXT:  // %bb.1: // %if
-; CHECK-NEXT:    ld.param.b32 %r4, [test_multi_block_param_0+4];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_multi_block_param_0+4];
 ; CHECK-NEXT:    bra.uni $L__BB5_3;
 ; CHECK-NEXT:  $L__BB5_2: // %else
-; CHECK-NEXT:    ld.param.b32 %r4, [test_multi_block_param_0+8];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_multi_block_param_0+8];
 ; CHECK-NEXT:  $L__BB5_3: // %end
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   br i1 %p, label %if, label %else
 if:
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index df32e2a4cfad2..264f38021e1de 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -66,22 +66,22 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 ; CHECK-LABEL: test_select_i1_basic(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<4>;
-; CHECK-NEXT:    .reg .b32 %r<12>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_basic_param_1];
-; CHECK-NEXT:    or.b32 %r4, %r1, %r2;
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r1, 0;
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_param_2];
-; CHECK-NEXT:    setp.eq.b32 %p2, %r5, 0;
-; CHECK-NEXT:    ld.param.b32 %r7, [test_select_i1_basic_param_3];
-; CHECK-NEXT:    setp.eq.b32 %p3, %r4, 0;
-; CHECK-NEXT:    ld.param.b32 %r8, [test_select_i1_basic_param_4];
-; CHECK-NEXT:    selp.b32 %r9, %r7, %r8, %p2;
-; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
-; CHECK-NEXT:    selp.b32 %r11, %r7, %r10, %p3;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_param_2];
+; CHECK-NEXT:    setp.eq.b32 %p2, %r4, 0;
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_param_3];
+; CHECK-NEXT:    setp.eq.b32 %p3, %r3, 0;
+; CHECK-NEXT:    ld.param.b32 %r6, [test_select_i1_basic_param_4];
+; CHECK-NEXT:    selp.b32 %r7, %r5, %r6, %p2;
+; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    selp.b32 %r9, %r5, %r8, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
   %b2 = icmp eq i32 %v2, 0
@@ -94,7 +94,7 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
 ; CHECK-LABEL: test_select_i1_basic_folding(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<13>;
+; CHECK-NEXT:    .reg .pred %p<11>;
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -106,14 +106,14 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
 ; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
 ; CHECK-NEXT:    setp.eq.b32 %p4, %r3, 0;
 ; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
-; CHECK-NEXT:    xor.pred %p6, %p1, %p3;
+; CHECK-NEXT:    xor.pred %p5, %p1, %p3;
 ; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
-; CHECK-NEXT:    and.pred %p8, %p6, %p4;
-; CHECK-NEXT:    and.pred %p9, %p2, %p4;
-; CHECK-NEXT:    and.pred %p10, %p3, %p8;
-; CHECK-NEXT:    or.pred %p11, %p10, %p9;
-; CHECK-NEXT:    xor.pred %p12, %p11, %p3;
-; CHECK-NEXT:    selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT:    and.pred %p6, %p5, %p4;
+; CHECK-NEXT:    and.pred %p7, %p2, %p4;
+; CHECK-NEXT:    and.pred %p8, %p3, %p6;
+; CHECK-NEXT:    or.pred %p9, %p8, %p7;
+; CHECK-NEXT:    xor.pred %p10, %p9, %p3;
+; CHECK-NEXT:    selp.b32 %r6, %r4, %r5, %p10;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128-array.ll b/llvm/test/CodeGen/NVPTX/i128-array.ll
index 3bb9c6aec51ac..7bd8a0021f1b5 100644
--- a/llvm/test/CodeGen/NVPTX/i128-array.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-array.ll
@@ -27,13 +27,13 @@ define [2 x i128] @foo(i64 %a, i32 %b) {
 define [2 x i128] @foo2(ptr byval([2 x i128]) %a) {
 ; CHECK-LABEL: foo2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [foo2_param_0+16];
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd4};
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd5, %rd6};
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0+16];
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
 ; CHECK-NEXT:    ret;
   %ptr0 = getelementptr [2 x i128], ptr %a, i64 0, i32 0
   %1 = load i128, i128* %ptr0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index 44d85589b5056..cdbbabe3e3b05 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -7,137 +7,137 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<127>;
+; CHECK-NEXT:    .reg .b64 %rd<79>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd49, %rd50}, [srem_i128_param_1];
-; CHECK-NEXT:    shr.s64 %rd2, %rd46, 63;
-; CHECK-NEXT:    sub.cc.s64 %rd51, 0, %rd45;
-; CHECK-NEXT:    subc.cc.s64 %rd52, 0, %rd46;
-; CHECK-NEXT:    setp.lt.s64 %p1, %rd46, 0;
-; CHECK-NEXT:    selp.b64 %rd4, %rd52, %rd46, %p1;
-; CHECK-NEXT:    selp.b64 %rd3, %rd51, %rd45, %p1;
-; CHECK-NEXT:    sub.cc.s64 %rd53, 0, %rd49;
-; CHECK-NEXT:    subc.cc.s64 %rd54, 0, %rd50;
-; CHECK-NEXT:    setp.lt.s64 %p2, %rd50, 0;
-; CHECK-NEXT:    selp.b64 %rd6, %rd54, %rd50, %p2;
-; CHECK-NEXT:    selp.b64 %rd5, %rd53, %rd49, %p2;
-; CHECK-NEXT:    or.b64 %rd55, %rd5, %rd6;
-; CHECK-NEXT:    setp.eq.b64 %p3, %rd55, 0;
-; CHECK-NEXT:    or.b64 %rd56, %rd3, %rd4;
-; CHECK-NEXT:    setp.eq.b64 %p4, %rd56, 0;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd8, %rd9}, [srem_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd10, %rd11}, [srem_i128_param_1];
+; CHECK-NEXT:    shr.s64 %rd1, %rd9, 63;
+; CHECK-NEXT:    sub.cc.s64 %rd12, 0, %rd8;
+; CHECK-NEXT:    subc.cc.s64 %rd13, 0, %rd9;
+; CHECK-NEXT:    setp.lt.s64 %p1, %rd9, 0;
+; CHECK-NEXT:    selp.b64 %rd3, %rd13, %rd9, %p1;
+; CHECK-NEXT:    selp.b64 %rd2, %rd12, %rd8, %p1;
+; CHECK-NEXT:    sub.cc.s64 %rd14, 0, %rd10;
+; CHECK-NEXT:    subc.cc.s64 %rd15, 0, %rd11;
+; CHECK-NEXT:    setp.lt.s64 %p2, %rd11, 0;
+; CHECK-NEXT:    selp.b64 %rd5, %rd15, %rd11, %p2;
+; CHECK-NEXT:    selp.b64 %rd4, %rd14, %rd10, %p2;
+; CHECK-NEXT:    or.b64 %rd16, %rd4, %rd5;
+; CHECK-NEXT:    setp.eq.b64 %p3, %rd16, 0;
+; CHECK-NEXT:    or.b64 %rd17, %rd2, %rd3;
+; CHECK-NEXT:    setp.eq.b64 %p4, %rd17, 0;
 ; CHECK-NEXT:    or.pred %p5, %p3, %p4;
-; CHECK-NEXT:    setp.ne.b64 %p6, %rd6, 0;
-; CHECK-NEXT:    clz.b64 %r1, %rd6;
-; CHECK-NEXT:    cvt.u64.u32 %rd57, %r1;
-; CHECK-NEXT:    clz.b64 %r2, %rd5;
-; CHECK-NEXT:    cvt.u64.u32 %rd58, %r2;
-; CHECK-NEXT:    add.s64 %rd59, %rd58, 64;
-; CHECK-NEXT:    selp.b64 %rd60, %rd57, %rd59, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p7, %rd4, 0;
-; CHECK-NEXT:    clz.b64 %r3, %rd4;
-; CHECK-NEXT:    cvt.u64.u32 %rd61, %r3;
-; CHECK-NEXT:    clz.b64 %r4, %rd3;
-; CHECK-NEXT:    cvt.u64.u32 %rd62, %r4;
-; CHECK-NEXT:    add.s64 %rd63, %rd62, 64;
-; CHECK-NEXT:    selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT:    mov.b64 %rd117, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT:    subc.cc.s64 %rd67, %rd117, 0;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd66, 127;
-; CHECK-NEXT:    setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT:    setp.ne.b64 %p6, %rd5, 0;
+; CHECK-NEXT:    clz.b64 %r1, %rd5;
+; CHECK-NEXT:    cvt.u64.u32 %rd18, %r1;
+; CHECK-NEXT:    clz.b64 %r2, %rd4;
+; CHECK-NEXT:    cvt.u64.u32 %rd19, %r2;
+; CHECK-NEXT:    add.s64 %rd20, %rd19, 64;
+; CHECK-NEXT:    selp.b64 %rd21, %rd18, %rd20, %p6;
+; CHECK-NEXT:    setp.ne.b64 %p7, %rd3, 0;
+; CHECK-NEXT:    clz.b64 %r3, %rd3;
+; CHECK-NEXT:    cvt.u64.u32 %rd22, %r3;
+; CHECK-NEXT:    clz.b64 %r4, %rd2;
+; CHECK-NEXT:    cvt.u64.u32 %rd23, %r4;
+; CHECK-NEXT:    add.s64 %rd24, %rd23, 64;
+; CHECK-NEXT:    selp.b64 %rd25, %rd22, %rd24, %p7;
+; CHECK-NEXT:    mov.b64 %rd70, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd26, %rd21, %rd25;
+; CHECK-NEXT:    subc.cc.s64 %rd27, %rd70, 0;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd26, 127;
+; CHECK-NEXT:    setp.eq.b64 %p9, %rd27, 0;
 ; CHECK-NEXT:    and.pred %p10, %p9, %p8;
-; CHECK-NEXT:    setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT:    setp.ne.b64 %p11, %rd27, 0;
 ; CHECK-NEXT:    or.pred %p12, %p10, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p5, %p12;
-; CHECK-NEXT:    xor.b64 %rd68, %rd66, 127;
-; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd67;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd69, 0;
-; CHECK-NEXT:    selp.b64 %rd126, 0, %rd4, %p13;
-; CHECK-NEXT:    selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT:    xor.b64 %rd28, %rd26, 127;
+; CHECK-NEXT:    or.b64 %rd29, %rd28, %rd27;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd29, 0;
+; CHECK-NEXT:    selp.b64 %rd78, 0, %rd3, %p13;
+; CHECK-NEXT:    selp.b64 %rd77, 0, %rd2, %p13;
 ; CHECK-NEXT:    or.pred %p15, %p13, %p14;
 ; CHECK-NEXT:    @%p15 bra $L__BB0_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd119, %rd66, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd120, %rd67, 0;
-; CHECK-NEXT:    or.b64 %rd72, %rd119, %rd120;
-; CHECK-NEXT:    setp.eq.b64 %p16, %rd72, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd66;
+; CHECK-NEXT:    add.cc.s64 %rd71, %rd26, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd72, %rd27, 0;
+; CHECK-NEXT:    or.b64 %rd30, %rd71, %rd72;
+; CHECK-NEXT:    setp.eq.b64 %p16, %rd30, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd26;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd73, %rd4, %r6;
+; CHECK-NEXT:    shl.b64 %rd31, %rd3, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r7;
-; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT:    shr.u64 %rd32, %rd2, %r7;
+; CHECK-NEXT:    or.b64 %rd33, %rd31, %rd32;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT:    shl.b64 %rd34, %rd2, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd124, %rd76, %rd75, %p17;
-; CHECK-NEXT:    shl.b64 %rd123, %rd3, %r6;
-; CHECK-NEXT:    mov.b64 %rd114, %rd117;
+; CHECK-NEXT:    selp.b64 %rd76, %rd34, %rd33, %p17;
+; CHECK-NEXT:    shl.b64 %rd75, %rd2, %r6;
+; CHECK-NEXT:    mov.b64 %rd69, %rd70;
 ; CHECK-NEXT:    @%p16 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd119;
-; CHECK-NEXT:    shr.u64 %rd79, %rd3, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd71;
+; CHECK-NEXT:    shr.u64 %rd35, %rd2, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd80, %rd4, %r10;
-; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT:    shl.b64 %rd36, %rd3, %r10;
+; CHECK-NEXT:    or.b64 %rd37, %rd35, %rd36;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT:    shr.u64 %rd38, %rd3, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd121, %rd82, %rd81, %p18;
-; CHECK-NEXT:    shr.u64 %rd122, %rd4, %r9;
-; CHECK-NEXT:    add.cc.s64 %rd35, %rd5, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT:    mov.b64 %rd114, 0;
-; CHECK-NEXT:    mov.b64 %rd117, %rd114;
+; CHECK-NEXT:    selp.b64 %rd73, %rd38, %rd37, %p18;
+; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r9;
+; CHECK-NEXT:    add.cc.s64 %rd6, %rd4, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd7, %rd5, -1;
+; CHECK-NEXT:    mov.b64 %rd69, 0;
+; CHECK-NEXT:    mov.b64 %rd70, %rd69;
 ; CHECK-NEXT:  $L__BB0_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd83, %rd121, 63;
-; CHECK-NEXT:    shl.b64 %rd84, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT:    shl.b64 %rd86, %rd121, 1;
-; CHECK-NEXT:    shr.u64 %rd87, %rd124, 63;
-; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT:    shr.u64 %rd89, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd123, %rd117, %rd92;
-; CHECK-NEXT:    or.b64 %rd124, %rd114, %rd91;
-; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT:    and.b64 %rd117, %rd95, 1;
-; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd5;
-; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd6;
-; CHECK-NEXT:    sub.cc.s64 %rd121, %rd88, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd122, %rd85, %rd97;
-; CHECK-NEXT:    add.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd120, %rd120, -1;
-; CHECK-NEXT:    or.b64 %rd98, %rd119, %rd120;
-; CHECK-NEXT:    setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT:    shr.u64 %rd39, %rd73, 63;
+; CHECK-NEXT:    shl.b64 %rd40, %rd74, 1;
+; CHECK-NEXT:    or.b64 %rd41, %rd40, %rd39;
+; CHECK-NEXT:    shl.b64 %rd42, %rd73, 1;
+; CHECK-NEXT:    shr.u64 %rd43, %rd76, 63;
+; CHECK-NEXT:    or.b64 %rd44, %rd42, %rd43;
+; CHECK-NEXT:    shr.u64 %rd45, %rd75, 63;
+; CHECK-NEXT:    shl.b64 %rd46, %rd76, 1;
+; CHECK-NEXT:    or.b64 %rd47, %rd46, %rd45;
+; CHECK-NEXT:    shl.b64 %rd48, %rd75, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd70, %rd48;
+; CHECK-NEXT:    or.b64 %rd76, %rd69, %rd47;
+; CHECK-NEXT:    sub.cc.s64 %rd49, %rd6, %rd44;
+; CHECK-NEXT:    subc.cc.s64 %rd50, %rd7, %rd41;
+; CHECK-NEXT:    shr.s64 %rd51, %rd50, 63;
+; CHECK-NEXT:    and.b64 %rd70, %rd51, 1;
+; CHECK-NEXT:    and.b64 %rd52, %rd51, %rd4;
+; CHECK-NEXT:    and.b64 %rd53, %rd51, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd73, %rd44, %rd52;
+; CHECK-NEXT:    subc.cc.s64 %rd74, %rd41, %rd53;
+; CHECK-NEXT:    add.cc.s64 %rd71, %rd71, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd72, %rd72, -1;
+; CHECK-NEXT:    or.b64 %rd54, %rd71, %rd72;
+; CHECK-NEXT:    setp.eq.b64 %p19, %rd54, 0;
 ; CHECK-NEXT:    @%p19 bra $L__BB0_4;
 ; CHECK-NEXT:    bra.uni $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd99, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd100, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT:    shl.b64 %rd102, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd125, %rd117, %rd102;
-; CHECK-NEXT:    or.b64 %rd126, %rd114, %rd101;
+; CHECK-NEXT:    shr.u64 %rd55, %rd75, 63;
+; CHECK-NEXT:    shl.b64 %rd56, %rd76, 1;
+; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd55;
+; CHECK-NEXT:    shl.b64 %rd58, %rd75, 1;
+; CHECK-NEXT:    or.b64 %rd77, %rd70, %rd58;
+; CHECK-NEXT:    or.b64 %rd78, %rd69, %rd57;
 ; CHECK-NEXT:  $L__BB0_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd103, %rd5, %rd125;
-; CHECK-NEXT:    mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
-; CHECK-NEXT:    mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
-; CHECK-NEXT:    mul.lo.s64 %rd106, %rd5, %rd125;
-; CHECK-NEXT:    sub.cc.s64 %rd107, %rd3, %rd106;
-; CHECK-NEXT:    subc.cc.s64 %rd108, %rd4, %rd105;
-; CHECK-NEXT:    xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT:    xor.b64 %rd110, %rd108, %rd2;
-; CHECK-NEXT:    sub.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT:    subc.cc.s64 %rd112, %rd110, %rd2;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd111, %rd112};
+; CHECK-NEXT:    mul.hi.u64 %rd59, %rd4, %rd77;
+; CHECK-NEXT:    mad.lo.s64 %rd60, %rd4, %rd78, %rd59;
+; CHECK-NEXT:    mad.lo.s64 %rd61, %rd5, %rd77, %rd60;
+; CHECK-NEXT:    mul.lo.s64 %rd62, %rd4, %rd77;
+; CHECK-NEXT:    sub.cc.s64 %rd63, %rd2, %rd62;
+; CHECK-NEXT:    subc.cc.s64 %rd64, %rd3, %rd61;
+; CHECK-NEXT:    xor.b64 %rd65, %rd63, %rd1;
+; CHECK-NEXT:    xor.b64 %rd66, %rd64, %rd1;
+; CHECK-NEXT:    sub.cc.s64 %rd67, %rd65, %rd1;
+; CHECK-NEXT:    subc.cc.s64 %rd68, %rd66, %rd1;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd67, %rd68};
 ; CHECK-NEXT:    ret;
   %div = srem i128 %lhs, %rhs
   ret i128 %div
@@ -148,122 +148,122 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<18>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<113>;
+; CHECK-NEXT:    .reg .b64 %rd<66>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [urem_i128_param_1];
-; CHECK-NEXT:    or.b64 %rd45, %rd3, %rd4;
-; CHECK-NEXT:    setp.eq.b64 %p1, %rd45, 0;
-; CHECK-NEXT:    or.b64 %rd46, %rd41, %rd42;
-; CHECK-NEXT:    setp.eq.b64 %p2, %rd46, 0;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [urem_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_param_1];
+; CHECK-NEXT:    or.b64 %rd7, %rd1, %rd2;
+; CHECK-NEXT:    setp.eq.b64 %p1, %rd7, 0;
+; CHECK-NEXT:    or.b64 %rd8, %rd5, %rd6;
+; CHECK-NEXT:    setp.eq.b64 %p2, %rd8, 0;
 ; CHECK-NEXT:    or.pred %p3, %p1, %p2;
-; CHECK-NEXT:    setp.ne.b64 %p4, %rd4, 0;
-; CHECK-NEXT:    clz.b64 %r1, %rd4;
-; CHECK-NEXT:    cvt.u64.u32 %rd47, %r1;
-; CHECK-NEXT:    clz.b64 %r2, %rd3;
-; CHECK-NEXT:    cvt.u64.u32 %rd48, %r2;
-; CHECK-NEXT:    add.s64 %rd49, %rd48, 64;
-; CHECK-NEXT:    selp.b64 %rd50, %rd47, %rd49, %p4;
-; CHECK-NEXT:    setp.ne.b64 %p5, %rd42, 0;
-; CHECK-NEXT:    clz.b64 %r3, %rd42;
-; CHECK-NEXT:    cvt.u64.u32 %rd51, %r3;
-; CHECK-NEXT:    clz.b64 %r4, %rd41;
-; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
-; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
-; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd103, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd57, %rd103, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
-; CHECK-NEXT:    setp.eq.b64 %p7, %rd57, 0;
+; CHECK-NEXT:    setp.ne.b64 %p4, %rd2, 0;
+; CHECK-NEXT:    clz.b64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u64.u32 %rd9, %r1;
+; CHECK-NEXT:    clz.b64 %r2, %rd1;
+; CHECK-NEXT:    cvt.u64.u32 %rd10, %r2;
+; CHECK-NEXT:    add.s64 %rd11, %rd10, 64;
+; CHECK-NEXT:    selp.b64 %rd12, %rd9, %rd11, %p4;
+; CHECK-NEXT:    setp.ne.b64 %p5, %rd6, 0;
+; CHECK-NEXT:    clz.b64 %r3, %rd6;
+; CHECK-NEXT:    cvt.u64.u32 %rd13, %r3;
+; CHECK-NEXT:    clz.b64 %r4, %rd5;
+; CHECK-NEXT:    cvt.u64.u32 %rd14, %r4;
+; CHECK-NEXT:    add.s64 %rd15, %rd14, 64;
+; CHECK-NEXT:    selp.b64 %rd16, %rd13, %rd15, %p5;
+; CHECK-NEXT:    mov.b64 %rd57, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd17, %rd12, %rd16;
+; CHECK-NEXT:    subc.cc.s64 %rd18, %rd57, 0;
+; CHECK-NEXT:    setp.gt.u64 %p6, %rd17, 127;
+; CHECK-NEXT:    setp.eq.b64 %p7, %rd18, 0;
 ; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p9, %rd57, 0;
+; CHECK-NEXT:    setp.ne.b64 %p9, %rd18, 0;
 ; CHECK-NEXT:    or.pred %p10, %p8, %p9;
 ; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
-; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT:    setp.eq.b64 %p12, %rd59, 0;
-; CHECK-NEXT:    selp.b64 %rd112, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd111, 0, %rd41, %p11;
+; CHECK-NEXT:    xor.b64 %rd19, %rd17, 127;
+; CHECK-NEXT:    or.b64 %rd20, %rd19, %rd18;
+; CHECK-NEXT:    setp.eq.b64 %p12, %rd20, 0;
+; CHECK-NEXT:    selp.b64 %rd65, 0, %rd6, %p11;
+; CHECK-NEXT:    selp.b64 %rd64, 0, %rd5, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p11, %p12;
 ; CHECK-NEXT:    @%p13 bra $L__BB1_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd105, %rd56, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd106, %rd57, 0;
-; CHECK-NEXT:    or.b64 %rd62, %rd105, %rd106;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd62, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
+; CHECK-NEXT:    add.cc.s64 %rd58, %rd17, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd59, %rd18, 0;
+; CHECK-NEXT:    or.b64 %rd21, %rd58, %rd59;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd21, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd17;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd22, %rd6, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
+; CHECK-NEXT:    shr.u64 %rd23, %rd5, %r7;
+; CHECK-NEXT:    or.b64 %rd24, %rd22, %rd23;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
+; CHECK-NEXT:    shl.b64 %rd25, %rd5, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd110, %rd66, %rd65, %p15;
-; CHECK-NEXT:    shl.b64 %rd109, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd100, %rd103;
+; CHECK-NEXT:    selp.b64 %rd63, %rd25, %rd24, %p15;
+; CHECK-NEXT:    shl.b64 %rd62, %rd5, %r6;
+; CHECK-NEXT:    mov.b64 %rd56, %rd57;
 ; CHECK-NEXT:    @%p14 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd105;
-; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd58;
+; CHECK-NEXT:    shr.u64 %rd26, %rd5, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
+; CHECK-NEXT:    shl.b64 %rd27, %rd6, %r10;
+; CHECK-NEXT:    or.b64 %rd28, %rd26, %rd27;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
+; CHECK-NEXT:    shr.u64 %rd29, %rd6, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd107, %rd72, %rd71, %p16;
-; CHECK-NEXT:    shr.u64 %rd108, %rd42, %r9;
-; CHECK-NEXT:    add.cc.s64 %rd33, %rd3, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd100, 0;
-; CHECK-NEXT:    mov.b64 %rd103, %rd100;
+; CHECK-NEXT:    selp.b64 %rd60, %rd29, %rd28, %p16;
+; CHECK-NEXT:    shr.u64 %rd61, %rd6, %r9;
+; CHECK-NEXT:    add.cc.s64 %rd3, %rd1, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd4, %rd2, -1;
+; CHECK-NEXT:    mov.b64 %rd56, 0;
+; CHECK-NEXT:    mov.b64 %rd57, %rd56;
 ; CHECK-NEXT:  $L__BB1_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd73, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd74, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
-; CHECK-NEXT:    shl.b64 %rd76, %rd107, 1;
-; CHECK-NEXT:    shr.u64 %rd77, %rd110, 63;
-; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
-; CHECK-NEXT:    shr.u64 %rd79, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd80, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
-; CHECK-NEXT:    shl.b64 %rd82, %rd109, 1;
-; CHECK-NEXT:    or.b64 %rd109, %rd103, %rd82;
-; CHECK-NEXT:    or.b64 %rd110, %rd100, %rd81;
-; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
-; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
-; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
-; CHECK-NEXT:    and.b64 %rd103, %rd85, 1;
-; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd3;
-; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd107, %rd78, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd108, %rd75, %rd87;
-; CHECK-NEXT:    add.cc.s64 %rd105, %rd105, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd106, %rd106, -1;
-; CHECK-NEXT:    or.b64 %rd88, %rd105, %rd106;
-; CHECK-NEXT:    setp.eq.b64 %p17, %rd88, 0;
+; CHECK-NEXT:    shr.u64 %rd30, %rd60, 63;
+; CHECK-NEXT:    shl.b64 %rd31, %rd61, 1;
+; CHECK-NEXT:    or.b64 %rd32, %rd31, %rd30;
+; CHECK-NEXT:    shl.b64 %rd33, %rd60, 1;
+; CHECK-NEXT:    shr.u64 %rd34, %rd63, 63;
+; CHECK-NEXT:    or.b64 %rd35, %rd33, %rd34;
+; CHECK-NEXT:    shr.u64 %rd36, %rd62, 63;
+; CHECK-NEXT:    shl.b64 %rd37, %rd63, 1;
+; CHECK-NEXT:    or.b64 %rd38, %rd37, %rd36;
+; CHECK-NEXT:    shl.b64 %rd39, %rd62, 1;
+; CHECK-NEXT:    or.b64 %rd62, %rd57, %rd39;
+; CHECK-NEXT:    or.b64 %rd63, %rd56, %rd38;
+; CHECK-NEXT:    sub.cc.s64 %rd40, %rd3, %rd35;
+; CHECK-NEXT:    subc.cc.s64 %rd41, %rd4, %rd32;
+; CHECK-NEXT:    shr.s64 %rd42, %rd41, 63;
+; CHECK-NEXT:    and.b64 %rd57, %rd42, 1;
+; CHECK-NEXT:    and.b64 %rd43, %rd42, %rd1;
+; CHECK-NEXT:    and.b64 %rd44, %rd42, %rd2;
+; CHECK-NEXT:    sub.cc.s64 %rd60, %rd35, %rd43;
+; CHECK-NEXT:    subc.cc.s64 %rd61, %rd32, %rd44;
+; CHECK-NEXT:    add.cc.s64 %rd58, %rd58, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd59, %rd59, -1;
+; CHECK-NEXT:    or.b64 %rd45, %rd58, %rd59;
+; CHECK-NEXT:    setp.eq.b64 %p17, %rd45, 0;
 ; CHECK-NEXT:    @%p17 bra $L__BB1_4;
 ; CHECK-NEXT:    bra.uni $L__BB1_2;
 ; CHECK-NEXT:  $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd89, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd109, 1;
-; CHECK-NEXT:    or.b64 %rd111, %rd103, %rd92;
-; CHECK-NEXT:    or.b64 %rd112, %rd100, %rd91;
+; CHECK-NEXT:    shr.u64 %rd46, %rd62, 63;
+; CHECK-NEXT:    shl.b64 %rd47, %rd63, 1;
+; CHECK-NEXT:    or.b64 %rd48, %rd47, %rd46;
+; CHECK-NEXT:    shl.b64 %rd49, %rd62, 1;
+; CHECK-NEXT:    or.b64 %rd64, %rd57, %rd49;
+; CHECK-NEXT:    or.b64 %rd65, %rd56, %rd48;
 ; CHECK-NEXT:  $L__BB1_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd93, %rd3, %rd111;
-; CHECK-NEXT:    mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
-; CHECK-NEXT:    mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
-; CHECK-NEXT:    mul.lo.s64 %rd96, %rd3, %rd111;
-; CHECK-NEXT:    sub.cc.s64 %rd97, %rd41, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd98, %rd42, %rd95;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd97, %rd98};
+; CHECK-NEXT:    mul.hi.u64 %rd50, %rd1, %rd64;
+; CHECK-NEXT:    mad.lo.s64 %rd51, %rd1, %rd65, %rd50;
+; CHECK-NEXT:    mad.lo.s64 %rd52, %rd2, %rd64, %rd51;
+; CHECK-NEXT:    mul.lo.s64 %rd53, %rd1, %rd64;
+; CHECK-NEXT:    sub.cc.s64 %rd54, %rd5, %rd53;
+; CHECK-NEXT:    subc.cc.s64 %rd55, %rd6, %rd52;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd54, %rd55};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, %rhs
   ret i128 %div
@@ -308,132 +308,132 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<122>;
+; CHECK-NEXT:    .reg .b64 %rd<74>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd49, %rd50}, [sdiv_i128_param_1];
-; CHECK-NEXT:    sub.cc.s64 %rd51, 0, %rd45;
-; CHECK-NEXT:    subc.cc.s64 %rd52, 0, %rd46;
-; CHECK-NEXT:    setp.lt.s64 %p1, %rd46, 0;
-; CHECK-NEXT:    selp.b64 %rd2, %rd52, %rd46, %p1;
-; CHECK-NEXT:    selp.b64 %rd1, %rd51, %rd45, %p1;
-; CHECK-NEXT:    sub.cc.s64 %rd53, 0, %rd49;
-; CHECK-NEXT:    subc.cc.s64 %rd54, 0, %rd50;
-; CHECK-NEXT:    setp.lt.s64 %p2, %rd50, 0;
-; CHECK-NEXT:    selp.b64 %rd4, %rd54, %rd50, %p2;
-; CHECK-NEXT:    selp.b64 %rd3, %rd53, %rd49, %p2;
-; CHECK-NEXT:    xor.b64 %rd55, %rd50, %rd46;
-; CHECK-NEXT:    shr.s64 %rd5, %rd55, 63;
-; CHECK-NEXT:    or.b64 %rd56, %rd3, %rd4;
-; CHECK-NEXT:    setp.eq.b64 %p3, %rd56, 0;
-; CHECK-NEXT:    or.b64 %rd57, %rd1, %rd2;
-; CHECK-NEXT:    setp.eq.b64 %p4, %rd57, 0;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd8, %rd9}, [sdiv_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd10, %rd11}, [sdiv_i128_param_1];
+; CHECK-NEXT:    sub.cc.s64 %rd12, 0, %rd8;
+; CHECK-NEXT:    subc.cc.s64 %rd13, 0, %rd9;
+; CHECK-NEXT:    setp.lt.s64 %p1, %rd9, 0;
+; CHECK-NEXT:    selp.b64 %rd2, %rd13, %rd9, %p1;
+; CHECK-NEXT:    selp.b64 %rd1, %rd12, %rd8, %p1;
+; CHECK-NEXT:    sub.cc.s64 %rd14, 0, %rd10;
+; CHECK-NEXT:    subc.cc.s64 %rd15, 0, %rd11;
+; CHECK-NEXT:    setp.lt.s64 %p2, %rd11, 0;
+; CHECK-NEXT:    selp.b64 %rd4, %rd15, %rd11, %p2;
+; CHECK-NEXT:    selp.b64 %rd3, %rd14, %rd10, %p2;
+; CHECK-NEXT:    xor.b64 %rd16, %rd11, %rd9;
+; CHECK-NEXT:    shr.s64 %rd5, %rd16, 63;
+; CHECK-NEXT:    or.b64 %rd17, %rd3, %rd4;
+; CHECK-NEXT:    setp.eq.b64 %p3, %rd17, 0;
+; CHECK-NEXT:    or.b64 %rd18, %rd1, %rd2;
+; CHECK-NEXT:    setp.eq.b64 %p4, %rd18, 0;
 ; CHECK-NEXT:    or.pred %p5, %p3, %p4;
 ; CHECK-NEXT:    setp.ne.b64 %p6, %rd4, 0;
 ; CHECK-NEXT:    clz.b64 %r1, %rd4;
-; CHECK-NEXT:    cvt.u64.u32 %rd58, %r1;
+; CHECK-NEXT:    cvt.u64.u32 %rd19, %r1;
 ; CHECK-NEXT:    clz.b64 %r2, %rd3;
-; CHECK-NEXT:    cvt.u64.u32 %rd59, %r2;
-; CHECK-NEXT:    add.s64 %rd60, %rd59, 64;
-; CHECK-NEXT:    selp.b64 %rd61, %rd58, %rd60, %p6;
+; CHECK-NEXT:    cvt.u64.u32 %rd20, %r2;
+; CHECK-NEXT:    add.s64 %rd21, %rd20, 64;
+; CHECK-NEXT:    selp.b64 %rd22, %rd19, %rd21, %p6;
 ; CHECK-NEXT:    setp.ne.b64 %p7, %rd2, 0;
 ; CHECK-NEXT:    clz.b64 %r3, %rd2;
-; CHECK-NEXT:    cvt.u64.u32 %rd62, %r3;
+; CHECK-NEXT:    cvt.u64.u32 %rd23, %r3;
 ; CHECK-NEXT:    clz.b64 %r4, %rd1;
-; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
-; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
-; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT:    mov.b64 %rd112, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT:    subc.cc.s64 %rd68, %rd112, 0;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd67, 127;
-; CHECK-NEXT:    setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT:    cvt.u64.u32 %rd24, %r4;
+; CHECK-NEXT:    add.s64 %rd25, %rd24, 64;
+; CHECK-NEXT:    selp.b64 %rd26, %rd23, %rd25, %p7;
+; CHECK-NEXT:    mov.b64 %rd65, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd27, %rd22, %rd26;
+; CHECK-NEXT:    subc.cc.s64 %rd28, %rd65, 0;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd27, 127;
+; CHECK-NEXT:    setp.eq.b64 %p9, %rd28, 0;
 ; CHECK-NEXT:    and.pred %p10, %p9, %p8;
-; CHECK-NEXT:    setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT:    setp.ne.b64 %p11, %rd28, 0;
 ; CHECK-NEXT:    or.pred %p12, %p10, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p5, %p12;
-; CHECK-NEXT:    xor.b64 %rd69, %rd67, 127;
-; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd68;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd70, 0;
-; CHECK-NEXT:    selp.b64 %rd121, 0, %rd2, %p13;
-; CHECK-NEXT:    selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT:    xor.b64 %rd29, %rd27, 127;
+; CHECK-NEXT:    or.b64 %rd30, %rd29, %rd28;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd30, 0;
+; CHECK-NEXT:    selp.b64 %rd73, 0, %rd2, %p13;
+; CHECK-NEXT:    selp.b64 %rd72, 0, %rd1, %p13;
 ; CHECK-NEXT:    or.pred %p15, %p13, %p14;
 ; CHECK-NEXT:    @%p15 bra $L__BB4_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd114, %rd67, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd115, %rd68, 0;
-; CHECK-NEXT:    or.b64 %rd73, %rd114, %rd115;
-; CHECK-NEXT:    setp.eq.b64 %p16, %rd73, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd67;
+; CHECK-NEXT:    add.cc.s64 %rd66, %rd27, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd67, %rd28, 0;
+; CHECK-NEXT:    or.b64 %rd31, %rd66, %rd67;
+; CHECK-NEXT:    setp.eq.b64 %p16, %rd31, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd27;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd74, %rd2, %r6;
+; CHECK-NEXT:    shl.b64 %rd32, %rd2, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd75, %rd1, %r7;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
+; CHECK-NEXT:    shr.u64 %rd33, %rd1, %r7;
+; CHECK-NEXT:    or.b64 %rd34, %rd32, %rd33;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT:    shl.b64 %rd35, %rd1, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd119, %rd77, %rd76, %p17;
-; CHECK-NEXT:    shl.b64 %rd118, %rd1, %r6;
-; CHECK-NEXT:    mov.b64 %rd109, %rd112;
+; CHECK-NEXT:    selp.b64 %rd71, %rd35, %rd34, %p17;
+; CHECK-NEXT:    shl.b64 %rd70, %rd1, %r6;
+; CHECK-NEXT:    mov.b64 %rd64, %rd65;
 ; CHECK-NEXT:    @%p16 bra $L__BB4_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd114;
-; CHECK-NEXT:    shr.u64 %rd80, %rd1, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd66;
+; CHECK-NEXT:    shr.u64 %rd36, %rd1, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd81, %rd2, %r10;
-; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
+; CHECK-NEXT:    shl.b64 %rd37, %rd2, %r10;
+; CHECK-NEXT:    or.b64 %rd38, %rd36, %rd37;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT:    shr.u64 %rd39, %rd2, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd116, %rd83, %rd82, %p18;
-; CHECK-NEXT:    shr.u64 %rd117, %rd2, %r9;
-; CHECK-NEXT:    add.cc.s64 %rd35, %rd3, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd109, 0;
-; CHECK-NEXT:    mov.b64 %rd112, %rd109;
+; CHECK-NEXT:    selp.b64 %rd68, %rd39, %rd38, %p18;
+; CHECK-NEXT:    shr.u64 %rd69, %rd2, %r9;
+; CHECK-NEXT:    add.cc.s64 %rd6, %rd3, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd7, %rd4, -1;
+; CHECK-NEXT:    mov.b64 %rd64, 0;
+; CHECK-NEXT:    mov.b64 %rd65, %rd64;
 ; CHECK-NEXT:  $L__BB4_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd84, %rd116, 63;
-; CHECK-NEXT:    shl.b64 %rd85, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
-; CHECK-NEXT:    shl.b64 %rd87, %rd116, 1;
-; CHECK-NEXT:    shr.u64 %rd88, %rd119, 63;
-; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
-; CHECK-NEXT:    shr.u64 %rd90, %rd118, 63;
-; CHECK-NEXT:    shl.b64 %rd91, %rd119, 1;
-; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
-; CHECK-NEXT:    shl.b64 %rd93, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd118, %rd112, %rd93;
-; CHECK-NEXT:    or.b64 %rd119, %rd109, %rd92;
-; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
-; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
-; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
-; CHECK-NEXT:    and.b64 %rd112, %rd96, 1;
-; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd3;
-; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd116, %rd89, %rd97;
-; CHECK-NEXT:    subc.cc.s64 %rd117, %rd86, %rd98;
-; CHECK-NEXT:    add.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd115, %rd115, -1;
-; CHECK-NEXT:    or.b64 %rd99, %rd114, %rd115;
-; CHECK-NEXT:    setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT:    shr.u64 %rd40, %rd68, 63;
+; CHECK-NEXT:    shl.b64 %rd41, %rd69, 1;
+; CHECK-NEXT:    or.b64 %rd42, %rd41, %rd40;
+; CHECK-NEXT:    shl.b64 %rd43, %rd68, 1;
+; CHECK-NEXT:    shr.u64 %rd44, %rd71, 63;
+; CHECK-NEXT:    or.b64 %rd45, %rd43, %rd44;
+; CHECK-NEXT:    shr.u64 %rd46, %rd70, 63;
+; CHECK-NEXT:    shl.b64 %rd47, %rd71, 1;
+; CHECK-NEXT:    or.b64 %rd48, %rd47, %rd46;
+; CHECK-NEXT:    shl.b64 %rd49, %rd70, 1;
+; CHECK-NEXT:    or.b64 %rd70, %rd65, %rd49;
+; CHECK-NEXT:    or.b64 %rd71, %rd64, %rd48;
+; CHECK-NEXT:    sub.cc.s64 %rd50, %rd6, %rd45;
+; CHECK-NEXT:    subc.cc.s64 %rd51, %rd7, %rd42;
+; CHECK-NEXT:    shr.s64 %rd52, %rd51, 63;
+; CHECK-NEXT:    and.b64 %rd65, %rd52, 1;
+; CHECK-NEXT:    and.b64 %rd53, %rd52, %rd3;
+; CHECK-NEXT:    and.b64 %rd54, %rd52, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd68, %rd45, %rd53;
+; CHECK-NEXT:    subc.cc.s64 %rd69, %rd42, %rd54;
+; CHECK-NEXT:    add.cc.s64 %rd66, %rd66, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd67, %rd67, -1;
+; CHECK-NEXT:    or.b64 %rd55, %rd66, %rd67;
+; CHECK-NEXT:    setp.eq.b64 %p19, %rd55, 0;
 ; CHECK-NEXT:    @%p19 bra $L__BB4_4;
 ; CHECK-NEXT:    bra.uni $L__BB4_2;
 ; CHECK-NEXT:  $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd100, %rd118, 63;
-; CHECK-NEXT:    shl.b64 %rd101, %rd119, 1;
-; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
-; CHECK-NEXT:    shl.b64 %rd103, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd120, %rd112, %rd103;
-; CHECK-NEXT:    or.b64 %rd121, %rd109, %rd102;
+; CHECK-NEXT:    shr.u64 %rd56, %rd70, 63;
+; CHECK-NEXT:    shl.b64 %rd57, %rd71, 1;
+; CHECK-NEXT:    or.b64 %rd58, %rd57, %rd56;
+; CHECK-NEXT:    shl.b64 %rd59, %rd70, 1;
+; CHECK-NEXT:    or.b64 %rd72, %rd65, %rd59;
+; CHECK-NEXT:    or.b64 %rd73, %rd64, %rd58;
 ; CHECK-NEXT:  $L__BB4_5: // %udiv-end
-; CHECK-NEXT:    xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT:    xor.b64 %rd105, %rd121, %rd5;
-; CHECK-NEXT:    sub.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT:    subc.cc.s64 %rd107, %rd105, %rd5;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd106, %rd107};
+; CHECK-NEXT:    xor.b64 %rd60, %rd72, %rd5;
+; CHECK-NEXT:    xor.b64 %rd61, %rd73, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd62, %rd60, %rd5;
+; CHECK-NEXT:    subc.cc.s64 %rd63, %rd61, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd62, %rd63};
 ; CHECK-NEXT:    ret;
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
@@ -444,116 +444,116 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<18>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<107>;
+; CHECK-NEXT:    .reg .b64 %rd<60>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd43, %rd44}, [udiv_i128_param_1];
-; CHECK-NEXT:    or.b64 %rd45, %rd43, %rd44;
-; CHECK-NEXT:    setp.eq.b64 %p1, %rd45, 0;
-; CHECK-NEXT:    or.b64 %rd46, %rd41, %rd42;
-; CHECK-NEXT:    setp.eq.b64 %p2, %rd46, 0;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [udiv_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [udiv_i128_param_1];
+; CHECK-NEXT:    or.b64 %rd7, %rd5, %rd6;
+; CHECK-NEXT:    setp.eq.b64 %p1, %rd7, 0;
+; CHECK-NEXT:    or.b64 %rd8, %rd3, %rd4;
+; CHECK-NEXT:    setp.eq.b64 %p2, %rd8, 0;
 ; CHECK-NEXT:    or.pred %p3, %p1, %p2;
-; CHECK-NEXT:    setp.ne.b64 %p4, %rd44, 0;
-; CHECK-NEXT:    clz.b64 %r1, %rd44;
-; CHECK-NEXT:    cvt.u64.u32 %rd47, %r1;
-; CHECK-NEXT:    clz.b64 %r2, %rd43;
-; CHECK-NEXT:    cvt.u64.u32 %rd48, %r2;
-; CHECK-NEXT:    add.s64 %rd49, %rd48, 64;
-; CHECK-NEXT:    selp.b64 %rd50, %rd47, %rd49, %p4;
-; CHECK-NEXT:    setp.ne.b64 %p5, %rd42, 0;
-; CHECK-NEXT:    clz.b64 %r3, %rd42;
-; CHECK-NEXT:    cvt.u64.u32 %rd51, %r3;
-; CHECK-NEXT:    clz.b64 %r4, %rd41;
-; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
-; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
-; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd97, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd57, %rd97, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
-; CHECK-NEXT:    setp.eq.b64 %p7, %rd57, 0;
+; CHECK-NEXT:    setp.ne.b64 %p4, %rd6, 0;
+; CHECK-NEXT:    clz.b64 %r1, %rd6;
+; CHECK-NEXT:    cvt.u64.u32 %rd9, %r1;
+; CHECK-NEXT:    clz.b64 %r2, %rd5;
+; CHECK-NEXT:    cvt.u64.u32 %rd10, %r2;
+; CHECK-NEXT:    add.s64 %rd11, %rd10, 64;
+; CHECK-NEXT:    selp.b64 %rd12, %rd9, %rd11, %p4;
+; CHECK-NEXT:    setp.ne.b64 %p5, %rd4, 0;
+; CHECK-NEXT:    clz.b64 %r3, %rd4;
+; CHECK-NEXT:    cvt.u64.u32 %rd13, %r3;
+; CHECK-NEXT:    clz.b64 %r4, %rd3;
+; CHECK-NEXT:    cvt.u64.u32 %rd14, %r4;
+; CHECK-NEXT:    add.s64 %rd15, %rd14, 64;
+; CHECK-NEXT:    selp.b64 %rd16, %rd13, %rd15, %p5;
+; CHECK-NEXT:    mov.b64 %rd51, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd17, %rd12, %rd16;
+; CHECK-NEXT:    subc.cc.s64 %rd18, %rd51, 0;
+; CHECK-NEXT:    setp.gt.u64 %p6, %rd17, 127;
+; CHECK-NEXT:    setp.eq.b64 %p7, %rd18, 0;
 ; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p9, %rd57, 0;
+; CHECK-NEXT:    setp.ne.b64 %p9, %rd18, 0;
 ; CHECK-NEXT:    or.pred %p10, %p8, %p9;
 ; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
-; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT:    setp.eq.b64 %p12, %rd59, 0;
-; CHECK-NEXT:    selp.b64 %rd106, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd105, 0, %rd41, %p11;
+; CHECK-NEXT:    xor.b64 %rd19, %rd17, 127;
+; CHECK-NEXT:    or.b64 %rd20, %rd19, %rd18;
+; CHECK-NEXT:    setp.eq.b64 %p12, %rd20, 0;
+; CHECK-NEXT:    selp.b64 %rd59, 0, %rd4, %p11;
+; CHECK-NEXT:    selp.b64 %rd58, 0, %rd3, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p11, %p12;
 ; CHECK-NEXT:    @%p13 bra $L__BB5_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd99, %rd56, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd100, %rd57, 0;
-; CHECK-NEXT:    or.b64 %rd62, %rd99, %rd100;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd62, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
+; CHECK-NEXT:    add.cc.s64 %rd52, %rd17, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd53, %rd18, 0;
+; CHECK-NEXT:    or.b64 %rd21, %rd52, %rd53;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd21, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd17;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd22, %rd4, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
+; CHECK-NEXT:    shr.u64 %rd23, %rd3, %r7;
+; CHECK-NEXT:    or.b64 %rd24, %rd22, %rd23;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
+; CHECK-NEXT:    shl.b64 %rd25, %rd3, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd104, %rd66, %rd65, %p15;
-; CHECK-NEXT:    shl.b64 %rd103, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd94, %rd97;
+; CHECK-NEXT:    selp.b64 %rd57, %rd25, %rd24, %p15;
+; CHECK-NEXT:    shl.b64 %rd56, %rd3, %r6;
+; CHECK-NEXT:    mov.b64 %rd50, %rd51;
 ; CHECK-NEXT:    @%p14 bra $L__BB5_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd99;
-; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd52;
+; CHECK-NEXT:    shr.u64 %rd26, %rd3, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
+; CHECK-NEXT:    shl.b64 %rd27, %rd4, %r10;
+; CHECK-NEXT:    or.b64 %rd28, %rd26, %rd27;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
+; CHECK-NEXT:    shr.u64 %rd29, %rd4, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd101, %rd72, %rd71, %p16;
-; CHECK-NEXT:    shr.u64 %rd102, %rd42, %r9;
-; CHECK-NEXT:    add.cc.s64 %rd33, %rd43, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT:    mov.b64 %rd94, 0;
-; CHECK-NEXT:    mov.b64 %rd97, %rd94;
+; CHECK-NEXT:    selp.b64 %rd54, %rd29, %rd28, %p16;
+; CHECK-NEXT:    shr.u64 %rd55, %rd4, %r9;
+; CHECK-NEXT:    add.cc.s64 %rd1, %rd5, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd2, %rd6, -1;
+; CHECK-NEXT:    mov.b64 %rd50, 0;
+; CHECK-NEXT:    mov.b64 %rd51, %rd50;
 ; CHECK-NEXT:  $L__BB5_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd73, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd74, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
-; CHECK-NEXT:    shl.b64 %rd76, %rd101, 1;
-; CHECK-NEXT:    shr.u64 %rd77, %rd104, 63;
-; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
-; CHECK-NEXT:    shr.u64 %rd79, %rd103, 63;
-; CHECK-NEXT:    shl.b64 %rd80, %rd104, 1;
-; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
-; CHECK-NEXT:    shl.b64 %rd82, %rd103, 1;
-; CHECK-NEXT:    or.b64 %rd103, %rd97, %rd82;
-; CHECK-NEXT:    or.b64 %rd104, %rd94, %rd81;
-; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
-; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
-; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
-; CHECK-NEXT:    and.b64 %rd97, %rd85, 1;
-; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd43;
-; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd44;
-; CHECK-NEXT:    sub.cc.s64 %rd101, %rd78, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd102, %rd75, %rd87;
-; CHECK-NEXT:    add.cc.s64 %rd99, %rd99, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd100, %rd100, -1;
-; CHECK-NEXT:    or.b64 %rd88, %rd99, %rd100;
-; CHECK-NEXT:    setp.eq.b64 %p17, %rd88, 0;
+; CHECK-NEXT:    shr.u64 %rd30, %rd54, 63;
+; CHECK-NEXT:    shl.b64 %rd31, %rd55, 1;
+; CHECK-NEXT:    or.b64 %rd32, %rd31, %rd30;
+; CHECK-NEXT:    shl.b64 %rd33, %rd54, 1;
+; CHECK-NEXT:    shr.u64 %rd34, %rd57, 63;
+; CHECK-NEXT:    or.b64 %rd35, %rd33, %rd34;
+; CHECK-NEXT:    shr.u64 %rd36, %rd56, 63;
+; CHECK-NEXT:    shl.b64 %rd37, %rd57, 1;
+; CHECK-NEXT:    or.b64 %rd38, %rd37, %rd36;
+; CHECK-NEXT:    shl.b64 %rd39, %rd56, 1;
+; CHECK-NEXT:    or.b64 %rd56, %rd51, %rd39;
+; CHECK-NEXT:    or.b64 %rd57, %rd50, %rd38;
+; CHECK-NEXT:    sub.cc.s64 %rd40, %rd1, %rd35;
+; CHECK-NEXT:    subc.cc.s64 %rd41, %rd2, %rd32;
+; CHECK-NEXT:    shr.s64 %rd42, %rd41, 63;
+; CHECK-NEXT:    and.b64 %rd51, %rd42, 1;
+; CHECK-NEXT:    and.b64 %rd43, %rd42, %rd5;
+; CHECK-NEXT:    and.b64 %rd44, %rd42, %rd6;
+; CHECK-NEXT:    sub.cc.s64 %rd54, %rd35, %rd43;
+; CHECK-NEXT:    subc.cc.s64 %rd55, %rd32, %rd44;
+; CHECK-NEXT:    add.cc.s64 %rd52, %rd52, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd53, %rd53, -1;
+; CHECK-NEXT:    or.b64 %rd45, %rd52, %rd53;
+; CHECK-NEXT:    setp.eq.b64 %p17, %rd45, 0;
 ; CHECK-NEXT:    @%p17 bra $L__BB5_4;
 ; CHECK-NEXT:    bra.uni $L__BB5_2;
 ; CHECK-NEXT:  $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd89, %rd103, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd104, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd103, 1;
-; CHECK-NEXT:    or.b64 %rd105, %rd97, %rd92;
-; CHECK-NEXT:    or.b64 %rd106, %rd94, %rd91;
+; CHECK-NEXT:    shr.u64 %rd46, %rd56, 63;
+; CHECK-NEXT:    shl.b64 %rd47, %rd57, 1;
+; CHECK-NEXT:    or.b64 %rd48, %rd47, %rd46;
+; CHECK-NEXT:    shl.b64 %rd49, %rd56, 1;
+; CHECK-NEXT:    or.b64 %rd58, %rd51, %rd49;
+; CHECK-NEXT:    or.b64 %rd59, %rd50, %rd48;
 ; CHECK-NEXT:  $L__BB5_5: // %udiv-end
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd58, %rd59};
 ; CHECK-NEXT:    ret;
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 74136bbe478c9..7f48245af4a26 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -635,7 +635,7 @@ declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0
 define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-LABEL: test_call(
 ; COMMON:       {
-; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
 ; COMMON-NEXT:    ld.param.b32 %r2, [test_call_param_1];
@@ -658,7 +658,7 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
 define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-LABEL: test_call_flipped(
 ; COMMON:       {
-; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
 ; COMMON-NEXT:    ld.param.b32 %r2, [test_call_flipped_param_1];
@@ -681,7 +681,7 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
 define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-LABEL: test_tailcall_flipped(
 ; COMMON:       {
-; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
 ; COMMON-NEXT:    ld.param.b32 %r2, [test_tailcall_flipped_param_1];
diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
index 98f94bb7b3ac1..53150c1a01314 100644
--- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
@@ -69,7 +69,7 @@ define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
 define <2 x i8> @test_call_2xi8(<2 x i8> %a) {
 ; O0-LABEL: test_call_2xi8(
 ; O0:       {
-; O0-NEXT:    .reg .b16 %rs<7>;
+; O0-NEXT:    .reg .b16 %rs<5>;
 ; O0-NEXT:    .reg .b32 %r<2>;
 ; O0-EMPTY:
 ; O0-NEXT:  // %bb.0:
@@ -87,7 +87,7 @@ define <2 x i8> @test_call_2xi8(<2 x i8> %a) {
 ;
 ; O3-LABEL: test_call_2xi8(
 ; O3:       {
-; O3-NEXT:    .reg .b16 %rs<7>;
+; O3-NEXT:    .reg .b16 %rs<5>;
 ; O3-EMPTY:
 ; O3-NEXT:  // %bb.0:
 ; O3-NEXT:    ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 26336b83c4f96..40d6a07310265 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1298,7 +1298,7 @@ declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0
 define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
 ; O0-LABEL: test_call(
 ; O0:       {
-; O0-NEXT:    .reg .b32 %r<5>;
+; O0-NEXT:    .reg .b32 %r<4>;
 ; O0-EMPTY:
 ; O0-NEXT:  // %bb.0:
 ; O0-NEXT:    ld.param.b32 %r2, [test_call_param_1];
@@ -1317,7 +1317,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
 ;
 ; O3-LABEL: test_call(
 ; O3:       {
-; O3-NEXT:    .reg .b32 %r<5>;
+; O3-NEXT:    .reg .b32 %r<4>;
 ; O3-EMPTY:
 ; O3-NEXT:  // %bb.0:
 ; O3-NEXT:    ld.param.b32 %r1, [test_call_param_0];
@@ -1340,7 +1340,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
 define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 ; O0-LABEL: test_call_flipped(
 ; O0:       {
-; O0-NEXT:    .reg .b32 %r<5>;
+; O0-NEXT:    .reg .b32 %r<4>;
 ; O0-EMPTY:
 ; O0-NEXT:  // %bb.0:
 ; O0-NEXT:    ld.param.b32 %r2, [test_call_flipped_param_1];
@@ -1359,7 +1359,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 ;
 ; O3-LABEL: test_call_flipped(
 ; O3:       {
-; O3-NEXT:    .reg .b32 %r<5>;
+; O3-NEXT:    .reg .b32 %r<4>;
 ; O3-EMPTY:
 ; O3-NEXT:  // %bb.0:
 ; O3-NEXT:    ld.param.b32 %r1, [test_call_flipped_param_0];
@@ -1382,7 +1382,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 ; O0-LABEL: test_tailcall_flipped(
 ; O0:       {
-; O0-NEXT:    .reg .b32 %r<5>;
+; O0-NEXT:    .reg .b32 %r<4>;
 ; O0-EMPTY:
 ; O0-NEXT:  // %bb.0:
 ; O0-NEXT:    ld.param.b32 %r2, [test_tailcall_flipped_param_1];
@@ -1401,7 +1401,7 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 ;
 ; O3-LABEL: test_tailcall_flipped(
 ; O3:       {
-; O3-NEXT:    .reg .b32 %r<5>;
+; O3-NEXT:    .reg .b32 %r<4>;
 ; O3-EMPTY:
 ; O3-NEXT:  // %bb.0:
 ; O3-NEXT:    ld.param.b32 %r1, [test_tailcall_flipped_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
index 782e6720e5112..673fb73948268 100644
--- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -16,8 +16,8 @@ define internal i32 @foo() {
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.b64 %SPL, __local_depot0;
@@ -29,8 +29,8 @@ define internal i32 @foo() {
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
 ; CHECK-NEXT:    st.param.b64 [param1], %rd2;
-; CHECK-NEXT:    add.u64 %rd4, %SPL, 1;
-; CHECK-NEXT:    ld.local.b8 %rs1, [%rd4];
+; CHECK-NEXT:    add.u64 %rd3, %SPL, 1;
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd3];
 ; CHECK-NEXT:    st.param.b8 [param0], %rs1;
 ; CHECK-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
 ; CHECK-NEXT:    call (retval0), %rd1, (param0, param1), prototype_0;
@@ -53,8 +53,8 @@ define internal i32 @bar() {
 ; CHECK-NEXT:    .local .align 8 .b8 __local_depot1[16];
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.b64 %SPL, __local_depot1;
@@ -66,9 +66,9 @@ define internal i32 @bar() {
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
 ; CHECK-NEXT:    st.param.b64 [param1], %rd2;
-; CHECK-NEXT:    add.u64 %rd4, %SPL, 8;
-; CHECK-NEXT:    ld.local.b64 %rd5, [%rd4];
-; CHECK-NEXT:    st.param.b64 [param0], %rd5;
+; CHECK-NEXT:    add.u64 %rd3, %SPL, 8;
+; CHECK-NEXT:    ld.local.b64 %rd4, [%rd3];
+; CHECK-NEXT:    st.param.b64 [param0], %rd4;
 ; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
 ; CHECK-NEXT:    call (retval0), %rd1, (param0, param1), prototype_1;
 ; CHECK-NEXT:    ld.param.b32 %r1, [retval0];
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
index 037d7df1aee59..bf0dd58e27a35 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -11,32 +11,32 @@ define void @test_b128_in_loop() {
 ; CHECK-LABEL: test_b128_in_loop(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
-; CHECK-NEXT:    .reg .b64 %rd<15>;
-; CHECK-NEXT:    .reg .b128 %rq<3>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b128 %rq<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.global.s32 %rd1, [size];
 ; CHECK-NEXT:    setp.eq.b64 %p1, %rd1, 0;
 ; CHECK-NEXT:    @%p1 bra $L__BB0_3;
 ; CHECK-NEXT:  // %bb.1: // %BB1
-; CHECK-NEXT:    ld.global.v2.b64 {%rd12, %rd13}, [x];
-; CHECK-NEXT:    mov.b64 %rd14, 0;
+; CHECK-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [x];
+; CHECK-NEXT:    mov.b64 %rd4, 0;
 ; CHECK-NEXT:  $L__BB0_2: // %BB2
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov.b128 %rq1, {%rd12, %rd13};
+; CHECK-NEXT:    mov.b128 %rq1, {%rd2, %rd3};
 ; CHECK-NEXT:    // begin inline asm
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .reg .b64 lo;
 ; CHECK-NEXT:    .reg .b64 hi;
 ; CHECK-NEXT:    mov.b128 {lo, hi}, %rq1;
-; CHECK-NEXT:    add.cc.u64 lo, lo, %rd14;
+; CHECK-NEXT:    add.cc.u64 lo, lo, %rd4;
 ; CHECK-NEXT:    mov.b128 %rq1, {lo, hi};
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // end inline asm
-; CHECK-NEXT:    mov.b128 {%rd12, %rd13}, %rq1;
-; CHECK-NEXT:    st.global.v2.b64 [x], {%rd12, %rd13};
-; CHECK-NEXT:    add.s64 %rd14, %rd14, 1;
-; CHECK-NEXT:    setp.ne.b64 %p2, %rd1, %rd14;
+; CHECK-NEXT:    mov.b128 {%rd2, %rd3}, %rq1;
+; CHECK-NEXT:    st.global.v2.b64 [x], {%rd2, %rd3};
+; CHECK-NEXT:    add.s64 %rd4, %rd4, 1;
+; CHECK-NEXT:    setp.ne.b64 %p2, %rd1, %rd4;
 ; CHECK-NEXT:    @%p2 bra $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_3: // %BB3
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll
index a6238352179ca..4620c5e01008c 100644
--- a/llvm/test/CodeGen/NVPTX/jump-table.ll
+++ b/llvm/test/CodeGen/NVPTX/jump-table.ll
@@ -10,11 +10,11 @@ define void @foo(i32 %i) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.b32 %r2, [foo_param_0];
-; CHECK-NEXT:    setp.gt.u32 %p1, %r2, 3;
+; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_0];
+; CHECK-NEXT:    setp.gt.u32 %p1, %r1, 3;
 ; CHECK-NEXT:    @%p1 bra $L__BB0_6;
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    $L_brx_0: .branchtargets
@@ -22,7 +22,7 @@ define void @foo(i32 %i) {
 ; CHECK-NEXT:     $L__BB0_3,
 ; CHECK-NEXT:     $L__BB0_4,
 ; CHECK-NEXT:     $L__BB0_5;
-; CHECK-NEXT:    brx.idx %r2, $L_brx_0;
+; CHECK-NEXT:    brx.idx %r1, $L_brx_0;
 ; CHECK-NEXT:  $L__BB0_2: // %case0
 ; CHECK-NEXT:    st.global.b32 [out], 0;
 ; CHECK-NEXT:    bra.uni $L__BB0_6;
diff --git a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll
index 03523a3be50c2..dfb0e80d0907d 100644
--- a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll
@@ -12,7 +12,7 @@ define ptr @foo(i1 %cond) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b8 %rs1, [foo_param_0];
@@ -21,14 +21,14 @@ define ptr @foo(i1 %cond) {
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .b64 retval0;
 ; CHECK-NEXT:    call.uni (retval0), baz, ();
-; CHECK-NEXT:    ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [retval0];
 ; CHECK-NEXT:    } // callseq 0
 ; CHECK-NEXT:    @%p1 bra $L__BB0_2;
 ; CHECK-NEXT:  // %bb.1: // %bb
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .b64 param0;
 ; CHECK-NEXT:    .param .b64 retval0;
-; CHECK-NEXT:    st.param.b64 [param0], %rd2;
+; CHECK-NEXT:    st.param.b64 [param0], %rd1;
 ; CHECK-NEXT:    call.uni (retval0), bar, (param0);
 ; CHECK-NEXT:    } // callseq 1
 ; CHECK-NEXT:  $L__BB0_2: // %common.ret
diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index efa2666090ccc..3ac8f65ff858b 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -7,7 +7,7 @@ declare <4 x float> @bar()
 define void @foo(ptr %ptr) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index ec8dd0c5c9350..d542fa58684a1 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -593,51 +593,51 @@ define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 ; SM20-LABEL: foo19(
 ; SM20:       {
 ; SM20-NEXT:    .reg .pred %p<2>;
-; SM20-NEXT:    .reg .b32 %r<10>;
-; SM20-NEXT:    .reg .b64 %rd<8>;
+; SM20-NEXT:    .reg .b32 %r<4>;
+; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0: // %entry
-; SM20-NEXT:    ld.param.b32 %r8, [foo19_param_2];
-; SM20-NEXT:    ld.param.b64 %rd5, [foo19_param_0];
-; SM20-NEXT:    cvta.to.global.u64 %rd7, %rd5;
-; SM20-NEXT:    ld.param.b64 %rd6, [foo19_param_1];
-; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd6;
-; SM20-NEXT:    mov.b32 %r9, 0f00000000;
+; SM20-NEXT:    ld.param.b32 %r2, [foo19_param_2];
+; SM20-NEXT:    ld.param.b64 %rd2, [foo19_param_0];
+; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd2;
+; SM20-NEXT:    ld.param.b64 %rd3, [foo19_param_1];
+; SM20-NEXT:    cvta.to.global.u64 %rd1, %rd3;
+; SM20-NEXT:    mov.b32 %r3, 0f00000000;
 ; SM20-NEXT:  $L__BB18_1: // %loop
 ; SM20-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM20-NEXT:    ld.global.b32 %r7, [%rd7];
-; SM20-NEXT:    add.rn.f32 %r9, %r7, %r9;
-; SM20-NEXT:    add.s64 %rd7, %rd7, 4;
-; SM20-NEXT:    add.s32 %r8, %r8, -1;
-; SM20-NEXT:    setp.ne.b32 %p1, %r8, 0;
+; SM20-NEXT:    ld.global.b32 %r1, [%rd4];
+; SM20-NEXT:    add.rn.f32 %r3, %r1, %r3;
+; SM20-NEXT:    add.s64 %rd4, %rd4, 4;
+; SM20-NEXT:    add.s32 %r2, %r2, -1;
+; SM20-NEXT:    setp.ne.b32 %p1, %r2, 0;
 ; SM20-NEXT:    @%p1 bra $L__BB18_1;
 ; SM20-NEXT:  // %bb.2: // %exit
-; SM20-NEXT:    st.global.b32 [%rd2], %r9;
+; SM20-NEXT:    st.global.b32 [%rd1], %r3;
 ; SM20-NEXT:    ret;
 ;
 ; SM35-LABEL: foo19(
 ; SM35:       {
 ; SM35-NEXT:    .reg .pred %p<2>;
-; SM35-NEXT:    .reg .b32 %r<10>;
-; SM35-NEXT:    .reg .b64 %rd<8>;
+; SM35-NEXT:    .reg .b32 %r<4>;
+; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0: // %entry
-; SM35-NEXT:    ld.param.b32 %r8, [foo19_param_2];
-; SM35-NEXT:    ld.param.b64 %rd5, [foo19_param_0];
-; SM35-NEXT:    cvta.to.global.u64 %rd7, %rd5;
-; SM35-NEXT:    ld.param.b64 %rd6, [foo19_param_1];
-; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd6;
-; SM35-NEXT:    mov.b32 %r9, 0f00000000;
+; SM35-NEXT:    ld.param.b32 %r2, [foo19_param_2];
+; SM35-NEXT:    ld.param.b64 %rd2, [foo19_param_0];
+; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd2;
+; SM35-NEXT:    ld.param.b64 %rd3, [foo19_param_1];
+; SM35-NEXT:    cvta.to.global.u64 %rd1, %rd3;
+; SM35-NEXT:    mov.b32 %r3, 0f00000000;
 ; SM35-NEXT:  $L__BB18_1: // %loop
 ; SM35-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM35-NEXT:    ld.global.nc.b32 %r7, [%rd7];
-; SM35-NEXT:    add.rn.f32 %r9, %r7, %r9;
-; SM35-NEXT:    add.s64 %rd7, %rd7, 4;
-; SM35-NEXT:    add.s32 %r8, %r8, -1;
-; SM35-NEXT:    setp.ne.b32 %p1, %r8, 0;
+; SM35-NEXT:    ld.global.nc.b32 %r1, [%rd4];
+; SM35-NEXT:    add.rn.f32 %r3, %r1, %r3;
+; SM35-NEXT:    add.s64 %rd4, %rd4, 4;
+; SM35-NEXT:    add.s32 %r2, %r2, -1;
+; SM35-NEXT:    setp.ne.b32 %p1, %r2, 0;
 ; SM35-NEXT:    @%p1 bra $L__BB18_1;
 ; SM35-NEXT:  // %bb.2: // %exit
-; SM35-NEXT:    st.global.b32 [%rd2], %r9;
+; SM35-NEXT:    st.global.b32 [%rd1], %r3;
 ; SM35-NEXT:    ret;
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index ae069cf956c36..f7137e05a5e4f 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -12,13 +12,13 @@ define void @foo(i32 %a) {
 ; PTX32-NEXT:    .local .align 4 .b8 __local_depot0[4];
 ; PTX32-NEXT:    .reg .b32 %SP;
 ; PTX32-NEXT:    .reg .b32 %SPL;
-; PTX32-NEXT:    .reg .b32 %r<4>;
+; PTX32-NEXT:    .reg .b32 %r<3>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot0;
 ; PTX32-NEXT:    ld.param.b32 %r1, [foo_param_0];
-; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
-; PTX32-NEXT:    st.local.b32 [%r3], %r1;
+; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
+; PTX32-NEXT:    st.local.b32 [%r2], %r1;
 ; PTX32-NEXT:    ret;
 ;
 ; PTX64-LABEL: foo(
@@ -27,13 +27,13 @@ define void @foo(i32 %a) {
 ; PTX64-NEXT:    .reg .b64 %SP;
 ; PTX64-NEXT:    .reg .b64 %SPL;
 ; PTX64-NEXT:    .reg .b32 %r<2>;
-; PTX64-NEXT:    .reg .b64 %rd<3>;
+; PTX64-NEXT:    .reg .b64 %rd<2>;
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot0;
 ; PTX64-NEXT:    ld.param.b32 %r1, [foo_param_0];
-; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
+; PTX64-NEXT:    add.u64 %rd1, %SPL, 0;
+; PTX64-NEXT:    st.local.b32 [%rd1], %r1;
 ; PTX64-NEXT:    ret;
   %local = alloca i32, align 4
   store volatile i32 %a, ptr %local
@@ -97,15 +97,15 @@ define void @foo3(i32 %a) {
 ; PTX32-NEXT:    .local .align 4 .b8 __local_depot2[12];
 ; PTX32-NEXT:    .reg .b32 %SP;
 ; PTX32-NEXT:    .reg .b32 %SPL;
-; PTX32-NEXT:    .reg .b32 %r<6>;
+; PTX32-NEXT:    .reg .b32 %r<5>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot2;
 ; PTX32-NEXT:    ld.param.b32 %r1, [foo3_param_0];
-; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
-; PTX32-NEXT:    shl.b32 %r4, %r1, 2;
-; PTX32-NEXT:    add.s32 %r5, %r3, %r4;
-; PTX32-NEXT:    st.local.b32 [%r5], %r1;
+; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
+; PTX32-NEXT:    shl.b32 %r3, %r1, 2;
+; PTX32-NEXT:    add.s32 %r4, %r2, %r3;
+; PTX32-NEXT:    st.local.b32 [%r4], %r1;
 ; PTX32-NEXT:    ret;
 ;
 ; PTX64-LABEL: foo3(
@@ -114,14 +114,14 @@ define void @foo3(i32 %a) {
 ; PTX64-NEXT:    .reg .b64 %SP;
 ; PTX64-NEXT:    .reg .b64 %SPL;
 ; PTX64-NEXT:    .reg .b32 %r<2>;
-; PTX64-NEXT:    .reg .b64 %rd<4>;
+; PTX64-NEXT:    .reg .b64 %rd<3>;
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot2;
 ; PTX64-NEXT:    ld.param.b32 %r1, [foo3_param_0];
-; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT:    mad.wide.s32 %rd3, %r1, 4, %rd2;
-; PTX64-NEXT:    st.local.b32 [%rd3], %r1;
+; PTX64-NEXT:    add.u64 %rd1, %SPL, 0;
+; PTX64-NEXT:    mad.wide.s32 %rd2, %r1, 4, %rd1;
+; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
 ; PTX64-NEXT:    ret;
   %local = alloca [3 x i32], align 4
   %1 = getelementptr inbounds i32, ptr %local, i32 %a
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f5df0fcde1883..8adde4ceefbf4 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -340,19 +340,19 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b32 %r<3>;
-; PTX-NEXT:    .reg .b64 %rd<7>;
+; PTX-NEXT:    .reg .b64 %rd<4>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    mov.b64 %rd6, grid_const_phi_param_0;
-; PTX-NEXT:    ld.param.b64 %rd5, [grid_const_phi_param_1];
-; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd5;
+; PTX-NEXT:    mov.b64 %rd3, grid_const_phi_param_0;
+; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_phi_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd2;
 ; PTX-NEXT:    ld.global.b32 %r1, [%rd1];
 ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
 ; PTX-NEXT:    @%p1 bra $L__BB9_2;
 ; PTX-NEXT:  // %bb.1: // %second
-; PTX-NEXT:    add.s64 %rd6, %rd6, 4;
+; PTX-NEXT:    add.s64 %rd3, %rd3, 4;
 ; PTX-NEXT:  $L__BB9_2: // %merge
-; PTX-NEXT:    ld.param.b32 %r2, [%rd6];
+; PTX-NEXT:    ld.param.b32 %r2, [%rd3];
 ; PTX-NEXT:    st.global.b32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_phi(
@@ -396,20 +396,20 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b32 %r<3>;
-; PTX-NEXT:    .reg .b64 %rd<8>;
+; PTX-NEXT:    .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    mov.b64 %rd7, grid_const_phi_ngc_param_0;
-; PTX-NEXT:    ld.param.b64 %rd6, [grid_const_phi_ngc_param_2];
-; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd6;
+; PTX-NEXT:    mov.b64 %rd4, grid_const_phi_ngc_param_0;
+; PTX-NEXT:    ld.param.b64 %rd3, [grid_const_phi_ngc_param_2];
+; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd3;
 ; PTX-NEXT:    ld.global.b32 %r1, [%rd1];
 ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
 ; PTX-NEXT:    @%p1 bra $L__BB10_2;
 ; PTX-NEXT:  // %bb.1: // %second
 ; PTX-NEXT:    mov.b64 %rd2, grid_const_phi_ngc_param_1;
-; PTX-NEXT:    add.s64 %rd7, %rd2, 4;
+; PTX-NEXT:    add.s64 %rd4, %rd2, 4;
 ; PTX-NEXT:  $L__BB10_2: // %merge
-; PTX-NEXT:    ld.param.b32 %r2, [%rd7];
+; PTX-NEXT:    ld.param.b32 %r2, [%rd4];
 ; PTX-NEXT:    st.global.b32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 4d36ff9496ede..21257e21bea9f 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -456,63 +456,63 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<3>;
-; PTX-NEXT:    .reg .b64 %rd<48>;
+; PTX-NEXT:    .reg .b64 %rd<47>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot9;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_to_param_param_0];
-; PTX-NEXT:    add.u64 %rd3, %SPL, 0;
+; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
 ; PTX-NEXT:    ld.param.b32 %r1, [memcpy_to_param_param_1+4];
-; PTX-NEXT:    st.local.b32 [%rd3+4], %r1;
+; PTX-NEXT:    st.local.b32 [%rd2+4], %r1;
 ; PTX-NEXT:    ld.param.b32 %r2, [memcpy_to_param_param_1];
-; PTX-NEXT:    st.local.b32 [%rd3], %r2;
-; PTX-NEXT:    ld.volatile.b8 %rd4, [%rd1];
-; PTX-NEXT:    ld.volatile.b8 %rd5, [%rd1+1];
-; PTX-NEXT:    shl.b64 %rd6, %rd5, 8;
-; PTX-NEXT:    or.b64 %rd7, %rd6, %rd4;
-; PTX-NEXT:    ld.volatile.b8 %rd8, [%rd1+2];
-; PTX-NEXT:    shl.b64 %rd9, %rd8, 16;
-; PTX-NEXT:    ld.volatile.b8 %rd10, [%rd1+3];
-; PTX-NEXT:    shl.b64 %rd11, %rd10, 24;
-; PTX-NEXT:    or.b64 %rd12, %rd11, %rd9;
-; PTX-NEXT:    or.b64 %rd13, %rd12, %rd7;
-; PTX-NEXT:    ld.volatile.b8 %rd14, [%rd1+4];
-; PTX-NEXT:    ld.volatile.b8 %rd15, [%rd1+5];
-; PTX-NEXT:    shl.b64 %rd16, %rd15, 8;
-; PTX-NEXT:    or.b64 %rd17, %rd16, %rd14;
-; PTX-NEXT:    ld.volatile.b8 %rd18, [%rd1+6];
-; PTX-NEXT:    shl.b64 %rd19, %rd18, 16;
-; PTX-NEXT:    ld.volatile.b8 %rd20, [%rd1+7];
-; PTX-NEXT:    shl.b64 %rd21, %rd20, 24;
-; PTX-NEXT:    or.b64 %rd22, %rd21, %rd19;
-; PTX-NEXT:    or.b64 %rd23, %rd22, %rd17;
-; PTX-NEXT:    shl.b64 %rd24, %rd23, 32;
-; PTX-NEXT:    or.b64 %rd25, %rd24, %rd13;
-; PTX-NEXT:    st.volatile.b64 [%SP], %rd25;
-; PTX-NEXT:    ld.volatile.b8 %rd26, [%rd1+8];
-; PTX-NEXT:    ld.volatile.b8 %rd27, [%rd1+9];
-; PTX-NEXT:    shl.b64 %rd28, %rd27, 8;
-; PTX-NEXT:    or.b64 %rd29, %rd28, %rd26;
-; PTX-NEXT:    ld.volatile.b8 %rd30, [%rd1+10];
-; PTX-NEXT:    shl.b64 %rd31, %rd30, 16;
-; PTX-NEXT:    ld.volatile.b8 %rd32, [%rd1+11];
-; PTX-NEXT:    shl.b64 %rd33, %rd32, 24;
-; PTX-NEXT:    or.b64 %rd34, %rd33, %rd31;
-; PTX-NEXT:    or.b64 %rd35, %rd34, %rd29;
-; PTX-NEXT:    ld.volatile.b8 %rd36, [%rd1+12];
-; PTX-NEXT:    ld.volatile.b8 %rd37, [%rd1+13];
-; PTX-NEXT:    shl.b64 %rd38, %rd37, 8;
-; PTX-NEXT:    or.b64 %rd39, %rd38, %rd36;
-; PTX-NEXT:    ld.volatile.b8 %rd40, [%rd1+14];
-; PTX-NEXT:    shl.b64 %rd41, %rd40, 16;
-; PTX-NEXT:    ld.volatile.b8 %rd42, [%rd1+15];
-; PTX-NEXT:    shl.b64 %rd43, %rd42, 24;
-; PTX-NEXT:    or.b64 %rd44, %rd43, %rd41;
-; PTX-NEXT:    or.b64 %rd45, %rd44, %rd39;
-; PTX-NEXT:    shl.b64 %rd46, %rd45, 32;
-; PTX-NEXT:    or.b64 %rd47, %rd46, %rd35;
-; PTX-NEXT:    st.volatile.b64 [%SP+8], %rd47;
+; PTX-NEXT:    st.local.b32 [%rd2], %r2;
+; PTX-NEXT:    ld.volatile.b8 %rd3, [%rd1];
+; PTX-NEXT:    ld.volatile.b8 %rd4, [%rd1+1];
+; PTX-NEXT:    shl.b64 %rd5, %rd4, 8;
+; PTX-NEXT:    or.b64 %rd6, %rd5, %rd3;
+; PTX-NEXT:    ld.volatile.b8 %rd7, [%rd1+2];
+; PTX-NEXT:    shl.b64 %rd8, %rd7, 16;
+; PTX-NEXT:    ld.volatile.b8 %rd9, [%rd1+3];
+; PTX-NEXT:    shl.b64 %rd10, %rd9, 24;
+; PTX-NEXT:    or.b64 %rd11, %rd10, %rd8;
+; PTX-NEXT:    or.b64 %rd12, %rd11, %rd6;
+; PTX-NEXT:    ld.volatile.b8 %rd13, [%rd1+4];
+; PTX-NEXT:    ld.volatile.b8 %rd14, [%rd1+5];
+; PTX-NEXT:    shl.b64 %rd15, %rd14, 8;
+; PTX-NEXT:    or.b64 %rd16, %rd15, %rd13;
+; PTX-NEXT:    ld.volatile.b8 %rd17, [%rd1+6];
+; PTX-NEXT:    shl.b64 %rd18, %rd17, 16;
+; PTX-NEXT:    ld.volatile.b8 %rd19, [%rd1+7];
+; PTX-NEXT:    shl.b64 %rd20, %rd19, 24;
+; PTX-NEXT:    or.b64 %rd21, %rd20, %rd18;
+; PTX-NEXT:    or.b64 %rd22, %rd21, %rd16;
+; PTX-NEXT:    shl.b64 %rd23, %rd22, 32;
+; PTX-NEXT:    or.b64 %rd24, %rd23, %rd12;
+; PTX-NEXT:    st.volatile.b64 [%SP], %rd24;
+; PTX-NEXT:    ld.volatile.b8 %rd25, [%rd1+8];
+; PTX-NEXT:    ld.volatile.b8 %rd26, [%rd1+9];
+; PTX-NEXT:    shl.b64 %rd27, %rd26, 8;
+; PTX-NEXT:    or.b64 %rd28, %rd27, %rd25;
+; PTX-NEXT:    ld.volatile.b8 %rd29, [%rd1+10];
+; PTX-NEXT:    shl.b64 %rd30, %rd29, 16;
+; PTX-NEXT:    ld.volatile.b8 %rd31, [%rd1+11];
+; PTX-NEXT:    shl.b64 %rd32, %rd31, 24;
+; PTX-NEXT:    or.b64 %rd33, %rd32, %rd30;
+; PTX-NEXT:    or.b64 %rd34, %rd33, %rd28;
+; PTX-NEXT:    ld.volatile.b8 %rd35, [%rd1+12];
+; PTX-NEXT:    ld.volatile.b8 %rd36, [%rd1+13];
+; PTX-NEXT:    shl.b64 %rd37, %rd36, 8;
+; PTX-NEXT:    or.b64 %rd38, %rd37, %rd35;
+; PTX-NEXT:    ld.volatile.b8 %rd39, [%rd1+14];
+; PTX-NEXT:    shl.b64 %rd40, %rd39, 16;
+; PTX-NEXT:    ld.volatile.b8 %rd41, [%rd1+15];
+; PTX-NEXT:    shl.b64 %rd42, %rd41, 24;
+; PTX-NEXT:    or.b64 %rd43, %rd42, %rd40;
+; PTX-NEXT:    or.b64 %rd44, %rd43, %rd38;
+; PTX-NEXT:    shl.b64 %rd45, %rd44, 32;
+; PTX-NEXT:    or.b64 %rd46, %rd45, %rd34;
+; PTX-NEXT:    st.volatile.b64 [%SP+8], %rd46;
 ; PTX-NEXT:    ret;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
@@ -651,7 +651,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<3>;
-; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-NEXT:    .reg .b64 %rd<4>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %bb
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot12;
@@ -663,10 +663,10 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-NEXT:    st.b32 [%SP], %r1;
 ; PTX-NEXT:    ld.param.b32 %r2, [test_select_write_param_0];
 ; PTX-NEXT:    st.b32 [%SP+4], %r2;
-; PTX-NEXT:    add.u64 %rd2, %SPL, 4;
-; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
-; PTX-NEXT:    selp.b64 %rd5, %rd2, %rd4, %p1;
-; PTX-NEXT:    st.local.b32 [%rd5], 1;
+; PTX-NEXT:    add.u64 %rd1, %SPL, 4;
+; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX-NEXT:    selp.b64 %rd3, %rd1, %rd2, %p1;
+; PTX-NEXT:    st.local.b32 [%rd3], 1;
 ; PTX-NEXT:    ret;
 bb:
   %ptrnew = select i1 %cond, ptr %input1, ptr %input2
@@ -743,7 +743,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_60:       {
 ; PTX_60-NEXT:    .reg .pred %p<2>;
 ; PTX_60-NEXT:    .reg .b16 %rs<3>;
-; PTX_60-NEXT:    .reg .b32 %r<5>;
+; PTX_60-NEXT:    .reg .b32 %r<2>;
 ; PTX_60-NEXT:    .reg .b64 %rd<3>;
 ; PTX_60-EMPTY:
 ; PTX_60-NEXT:  // %bb.0: // %bb
@@ -752,12 +752,12 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_60-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; PTX_60-NEXT:    ld.param.b64 %rd2, [test_phi_param_2];
 ; PTX_60-NEXT:    cvta.to.global.u64 %rd1, %rd2;
-; PTX_60-NEXT:    ld.param.b32 %r4, [test_phi_param_0];
+; PTX_60-NEXT:    ld.param.b32 %r1, [test_phi_param_0];
 ; PTX_60-NEXT:    @%p1 bra $L__BB13_2;
 ; PTX_60-NEXT:  // %bb.1: // %second
-; PTX_60-NEXT:    ld.param.b32 %r4, [test_phi_param_1+4];
+; PTX_60-NEXT:    ld.param.b32 %r1, [test_phi_param_1+4];
 ; PTX_60-NEXT:  $L__BB13_2: // %merge
-; PTX_60-NEXT:    st.global.b32 [%rd1], %r4;
+; PTX_60-NEXT:    st.global.b32 [%rd1], %r1;
 ; PTX_60-NEXT:    ret;
 ;
 ; PTX_70-LABEL: test_phi(
@@ -765,21 +765,21 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_70-NEXT:    .reg .pred %p<2>;
 ; PTX_70-NEXT:    .reg .b16 %rs<3>;
 ; PTX_70-NEXT:    .reg .b32 %r<2>;
-; PTX_70-NEXT:    .reg .b64 %rd<8>;
+; PTX_70-NEXT:    .reg .b64 %rd<5>;
 ; PTX_70-EMPTY:
 ; PTX_70-NEXT:  // %bb.0: // %bb
 ; PTX_70-NEXT:    ld.param.b8 %rs1, [test_phi_param_3];
 ; PTX_70-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX_70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; PTX_70-NEXT:    mov.b64 %rd7, test_phi_param_0;
-; PTX_70-NEXT:    ld.param.b64 %rd6, [test_phi_param_2];
-; PTX_70-NEXT:    cvta.to.global.u64 %rd1, %rd6;
+; PTX_70-NEXT:    mov.b64 %rd4, test_phi_param_0;
+; PTX_70-NEXT:    ld.param.b64 %rd3, [test_phi_param_2];
+; PTX_70-NEXT:    cvta.to.global.u64 %rd1, %rd3;
 ; PTX_70-NEXT:    @%p1 bra $L__BB13_2;
 ; PTX_70-NEXT:  // %bb.1: // %second
 ; PTX_70-NEXT:    mov.b64 %rd2, test_phi_param_1;
-; PTX_70-NEXT:    add.s64 %rd7, %rd2, 4;
+; PTX_70-NEXT:    add.s64 %rd4, %rd2, 4;
 ; PTX_70-NEXT:  $L__BB13_2: // %merge
-; PTX_70-NEXT:    ld.param.b32 %r1, [%rd7];
+; PTX_70-NEXT:    ld.param.b32 %r1, [%rd4];
 ; PTX_70-NEXT:    st.global.b32 [%rd1], %r1;
 ; PTX_70-NEXT:    ret;
 bb:
@@ -830,7 +830,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<3>;
-; PTX-NEXT:    .reg .b64 %rd<7>;
+; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %bb
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot14;
@@ -841,14 +841,14 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
 ; PTX-NEXT:    add.u64 %rd1, %SPL, 0;
 ; PTX-NEXT:    ld.param.b32 %r1, [test_phi_write_param_1+4];
 ; PTX-NEXT:    st.b32 [%SP], %r1;
-; PTX-NEXT:    add.u64 %rd6, %SPL, 4;
+; PTX-NEXT:    add.u64 %rd2, %SPL, 4;
 ; PTX-NEXT:    ld.param.b32 %r2, [test_phi_write_param_0];
 ; PTX-NEXT:    st.b32 [%SP+4], %r2;
 ; PTX-NEXT:    @%p1 bra $L__BB14_2;
 ; PTX-NEXT:  // %bb.1: // %second
-; PTX-NEXT:    mov.b64 %rd6, %rd1;
+; PTX-NEXT:    mov.b64 %rd2, %rd1;
 ; PTX-NEXT:  $L__BB14_2: // %merge
-; PTX-NEXT:    st.local.b32 [%rd6], 1;
+; PTX-NEXT:    st.local.b32 [%rd2], 1;
 ; PTX-NEXT:    ret;
 bb:
   br i1 %cond, label %first, label %second
@@ -882,13 +882,13 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<2>;
-; PTX-NEXT:    .reg .b64 %rd<3>;
+; PTX-NEXT:    .reg .b64 %rd<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot15;
-; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX-NEXT:    add.u64 %rd1, %SPL, 0;
 ; PTX-NEXT:    ld.param.b32 %r1, [test_forward_byval_arg_param_0];
-; PTX-NEXT:    st.local.b32 [%rd2], %r1;
+; PTX-NEXT:    st.local.b32 [%rd1], %r1;
 ; PTX-NEXT:    { // callseq 2, 0
 ; PTX-NEXT:    .param .align 4 .b8 param0[4];
 ; PTX-NEXT:    st.param.b32 [param0], %r1;
@@ -908,7 +908,6 @@ define void @device_func(ptr byval(i32) align 4 %input) {
 ; PTX-LABEL: device_func(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<2>;
-; PTX-NEXT:    .reg .b64 %rd<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    { // callseq 3, 0
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index b2994c0a97585..62f99e991ea1e 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -7,14 +7,14 @@ target triple = "nvptx64-nvidia-cuda"
 define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-LABEL: wombat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<11>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %bb
-; CHECK-NEXT:    ld.param.b32 %r4, [wombat_param_2];
-; CHECK-NEXT:    ld.param.b32 %r3, [wombat_param_1];
-; CHECK-NEXT:    ld.param.b32 %r2, [wombat_param_0];
-; CHECK-NEXT:    mov.b32 %r10, 0;
+; CHECK-NEXT:    ld.param.b32 %r3, [wombat_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [wombat_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [wombat_param_0];
+; CHECK-NEXT:    mov.b32 %r7, 0;
 ; CHECK-NEXT:  $L__BB0_1: // %bb3
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    { // callseq 0, 0
@@ -23,15 +23,15 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    st.param.b64 [param0], 0;
 ; CHECK-NEXT:    call.uni (retval0), quux, (param0);
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    mul.lo.s32 %r7, %r10, %r3;
-; CHECK-NEXT:    or.b32 %r8, %r4, %r7;
-; CHECK-NEXT:    mul.lo.s32 %r9, %r2, %r8;
-; CHECK-NEXT:    cvt.rn.f64.s32 %rd1, %r9;
-; CHECK-NEXT:    cvt.rn.f64.u32 %rd2, %r10;
+; CHECK-NEXT:    mul.lo.s32 %r4, %r7, %r2;
+; CHECK-NEXT:    or.b32 %r5, %r3, %r4;
+; CHECK-NEXT:    mul.lo.s32 %r6, %r1, %r5;
+; CHECK-NEXT:    cvt.rn.f64.s32 %rd1, %r6;
+; CHECK-NEXT:    cvt.rn.f64.u32 %rd2, %r7;
 ; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, %rd1;
 ; CHECK-NEXT:    mov.b64 %rd4, 0;
 ; CHECK-NEXT:    st.global.b64 [%rd4], %rd3;
-; CHECK-NEXT:    mov.b32 %r10, 1;
+; CHECK-NEXT:    mov.b32 %r7, 1;
 ; CHECK-NEXT:    bra.uni $L__BB0_1;
 bb:
   br label %bb3
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index c5ea9f850ea1f..06d7384200696 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -14,7 +14,7 @@ declare i32 @callee(%struct.1float %a)
 define i32 @test(%struct.1float alignstack(32) %data) {
 ; CHECK-LABEL: test(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll
index 2155fb4031c36..8899709d1cf15 100644
--- a/llvm/test/CodeGen/NVPTX/param-overalign.ll
+++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll
@@ -21,7 +21,7 @@ target triple = "nvptx64-nvidia-cuda"
 define float @caller_md(float %a, float %b) {
 ; CHECK-LABEL: caller_md(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [caller_md_param_0];
@@ -62,7 +62,7 @@ define float @callee_md(%struct.float2 alignstack(8) %a) {
 define float @caller(float %a, float %b) {
 ; CHECK-LABEL: caller(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [caller_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index 8056855a0d539..d443aebf32447 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -37,7 +37,7 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [bar_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index abc2ea89b62cf..c0ced65709610 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -34,7 +34,6 @@ define ptx_kernel void @bar(i32 %val, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [bar_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index 3138d7c4c14db..20f6e2ec50c2c 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -37,7 +37,7 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [bar_param_0];
@@ -58,7 +58,7 @@ declare float @texfunc(i64)
 define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-LABEL: baz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -74,8 +74,8 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    call.uni (retval0), texfunc, (param0);
 ; CHECK-NEXT:    ld.param.b32 %r6, [retval0];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    add.rn.f32 %r8, %r2, %r6;
-; CHECK-NEXT:    st.global.b32 [%rd2], %r8;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, %r6;
+; CHECK-NEXT:    st.global.b32 [%rd2], %r7;
 ; CHECK-NEXT:    ret;
   %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
diff --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
index 4edbec48e6bec..c5299046e1db3 100644
--- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -35,7 +35,6 @@ define i32 @t1() {
 ; CHECK-LABEL: t1(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    txq.width.b32 %r1, [tex0];
@@ -66,7 +65,6 @@ define i32 @t3() {
 ; CHECK-LABEL: t3(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    txq.height.b32 %r1, [tex0];
@@ -97,7 +95,6 @@ define i32 @s1() {
 ; CHECK-LABEL: s1(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    suq.width.b32 %r1, [surf0];
@@ -128,7 +125,6 @@ define i32 @s3() {
 ; CHECK-LABEL: s3(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    suq.height.b32 %r1, [surf0];
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 697eb90fb1740..526355247c009 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -24,9 +24,9 @@
 define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK-LABEL: test_s_i8i16p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<13>;
+; CHECK-NEXT:    .reg .b16 %rs<9>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_s_i8i16p_param_0];
@@ -45,14 +45,14 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs4, [retval0+4];
 ; CHECK-NEXT:    ld.param.b8 %rs5, [retval0+3];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    shl.b16 %rs8, %rs4, 8;
-; CHECK-NEXT:    or.b16 %rs9, %rs8, %rs5;
+; CHECK-NEXT:    shl.b16 %rs6, %rs4, 8;
+; CHECK-NEXT:    or.b16 %rs7, %rs6, %rs5;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+3], %rs5;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+8], %rd2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+2], %rs2;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
-; CHECK-NEXT:    shr.u16 %rs12, %rs9, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs12;
+; CHECK-NEXT:    shr.u16 %rs8, %rs7, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs8;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
   ret %s_i8i16p %r
@@ -62,9 +62,9 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-LABEL: test_s_i8i32p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<24>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_s_i8i32p_param_0];
@@ -91,22 +91,22 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-NEXT:    ld.param.b8 %r8, [retval0+6];
 ; CHECK-NEXT:    ld.param.b8 %r9, [retval0+5];
 ; CHECK-NEXT:    } // callseq 1
-; CHECK-NEXT:    shl.b32 %r12, %r8, 8;
-; CHECK-NEXT:    or.b32 %r13, %r12, %r9;
-; CHECK-NEXT:    shl.b32 %r15, %r7, 16;
-; CHECK-NEXT:    shl.b32 %r17, %r6, 24;
-; CHECK-NEXT:    or.b32 %r18, %r17, %r15;
-; CHECK-NEXT:    or.b32 %r19, %r18, %r13;
+; CHECK-NEXT:    shl.b32 %r10, %r8, 8;
+; CHECK-NEXT:    or.b32 %r11, %r10, %r9;
+; CHECK-NEXT:    shl.b32 %r12, %r7, 16;
+; CHECK-NEXT:    shl.b32 %r13, %r6, 24;
+; CHECK-NEXT:    or.b32 %r14, %r13, %r12;
+; CHECK-NEXT:    or.b32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r9;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
-; CHECK-NEXT:    shr.u32 %r21, %r19, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r21;
-; CHECK-NEXT:    shr.u32 %r22, %r19, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r22;
-; CHECK-NEXT:    shr.u32 %r23, %r19, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r23;
+; CHECK-NEXT:    shr.u32 %r16, %r15, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r16;
+; CHECK-NEXT:    shr.u32 %r17, %r15, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT:    shr.u32 %r18, %r15, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r18;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
   ret %s_i8i32p %r
@@ -116,8 +116,8 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-LABEL: test_s_i8i64p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b64 %rd<46>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b64 %rd<36>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_s_i8i64p_param_0];
@@ -144,38 +144,38 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rd13, [retval0+10];
 ; CHECK-NEXT:    ld.param.b8 %rd14, [retval0+9];
 ; CHECK-NEXT:    } // callseq 2
-; CHECK-NEXT:    shl.b64 %rd17, %rd13, 8;
-; CHECK-NEXT:    or.b64 %rd18, %rd17, %rd14;
-; CHECK-NEXT:    shl.b64 %rd20, %rd12, 16;
-; CHECK-NEXT:    shl.b64 %rd22, %rd11, 24;
-; CHECK-NEXT:    or.b64 %rd23, %rd22, %rd20;
-; CHECK-NEXT:    or.b64 %rd24, %rd23, %rd18;
-; CHECK-NEXT:    shl.b64 %rd27, %rd9, 8;
-; CHECK-NEXT:    or.b64 %rd28, %rd27, %rd10;
-; CHECK-NEXT:    shl.b64 %rd30, %rd8, 16;
-; CHECK-NEXT:    shl.b64 %rd32, %rd7, 24;
-; CHECK-NEXT:    or.b64 %rd33, %rd32, %rd30;
-; CHECK-NEXT:    or.b64 %rd34, %rd33, %rd28;
-; CHECK-NEXT:    shl.b64 %rd35, %rd34, 32;
-; CHECK-NEXT:    or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT:    shl.b64 %rd15, %rd13, 8;
+; CHECK-NEXT:    or.b64 %rd16, %rd15, %rd14;
+; CHECK-NEXT:    shl.b64 %rd17, %rd12, 16;
+; CHECK-NEXT:    shl.b64 %rd18, %rd11, 24;
+; CHECK-NEXT:    or.b64 %rd19, %rd18, %rd17;
+; CHECK-NEXT:    or.b64 %rd20, %rd19, %rd16;
+; CHECK-NEXT:    shl.b64 %rd21, %rd9, 8;
+; CHECK-NEXT:    or.b64 %rd22, %rd21, %rd10;
+; CHECK-NEXT:    shl.b64 %rd23, %rd8, 16;
+; CHECK-NEXT:    shl.b64 %rd24, %rd7, 24;
+; CHECK-NEXT:    or.b64 %rd25, %rd24, %rd23;
+; CHECK-NEXT:    or.b64 %rd26, %rd25, %rd22;
+; CHECK-NEXT:    shl.b64 %rd27, %rd26, 32;
+; CHECK-NEXT:    or.b64 %rd28, %rd27, %rd20;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+9], %rd14;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+24], %rd5;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+8], %rs1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
-; CHECK-NEXT:    shr.u64 %rd39, %rd36, 56;
-; CHECK-NEXT:    st.param.b8 [func_retval0+16], %rd39;
-; CHECK-NEXT:    shr.u64 %rd40, %rd36, 48;
-; CHECK-NEXT:    st.param.b8 [func_retval0+15], %rd40;
-; CHECK-NEXT:    shr.u64 %rd41, %rd36, 40;
-; CHECK-NEXT:    st.param.b8 [func_retval0+14], %rd41;
-; CHECK-NEXT:    shr.u64 %rd42, %rd36, 32;
-; CHECK-NEXT:    st.param.b8 [func_retval0+13], %rd42;
-; CHECK-NEXT:    shr.u64 %rd43, %rd36, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT:    shr.u64 %rd44, %rd36, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd44;
-; CHECK-NEXT:    shr.u64 %rd45, %rd36, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd45;
+; CHECK-NEXT:    shr.u64 %rd29, %rd28, 56;
+; CHECK-NEXT:    st.param.b8 [func_retval0+16], %rd29;
+; CHECK-NEXT:    shr.u64 %rd30, %rd28, 48;
+; CHECK-NEXT:    st.param.b8 [func_retval0+15], %rd30;
+; CHECK-NEXT:    shr.u64 %rd31, %rd28, 40;
+; CHECK-NEXT:    st.param.b8 [func_retval0+14], %rd31;
+; CHECK-NEXT:    shr.u64 %rd32, %rd28, 32;
+; CHECK-NEXT:    st.param.b8 [func_retval0+13], %rd32;
+; CHECK-NEXT:    shr.u64 %rd33, %rd28, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd33;
+; CHECK-NEXT:    shr.u64 %rd34, %rd28, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd34;
+; CHECK-NEXT:    shr.u64 %rd35, %rd28, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd35;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
   ret %s_i8i64p %r
@@ -185,8 +185,8 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-LABEL: test_s_i8f16p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<15>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b16 %rs<11>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [test_s_i8f16p_param_0];
@@ -207,14 +207,14 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs6, [retval0+4];
 ; CHECK-NEXT:    ld.param.b8 %rs7, [retval0+3];
 ; CHECK-NEXT:    } // callseq 3
-; CHECK-NEXT:    shl.b16 %rs10, %rs6, 8;
-; CHECK-NEXT:    or.b16 %rs11, %rs10, %rs7;
+; CHECK-NEXT:    shl.b16 %rs8, %rs6, 8;
+; CHECK-NEXT:    or.b16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+3], %rs7;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+8], %rd2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+2], %rs4;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-NEXT:    shr.u16 %rs14, %rs11, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs14;
+; CHECK-NEXT:    shr.u16 %rs10, %rs9, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs10;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
   ret %s_i8f16p %r
@@ -224,9 +224,9 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-LABEL: test_s_i8f16x2p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<24>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_s_i8f16x2p_param_0];
@@ -253,22 +253,22 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-NEXT:    ld.param.b8 %r8, [retval0+6];
 ; CHECK-NEXT:    ld.param.b8 %r9, [retval0+5];
 ; CHECK-NEXT:    } // callseq 4
-; CHECK-NEXT:    shl.b32 %r12, %r8, 8;
-; CHECK-NEXT:    or.b32 %r13, %r12, %r9;
-; CHECK-NEXT:    shl.b32 %r15, %r7, 16;
-; CHECK-NEXT:    shl.b32 %r17, %r6, 24;
-; CHECK-NEXT:    or.b32 %r18, %r17, %r15;
-; CHECK-NEXT:    or.b32 %r19, %r18, %r13;
+; CHECK-NEXT:    shl.b32 %r10, %r8, 8;
+; CHECK-NEXT:    or.b32 %r11, %r10, %r9;
+; CHECK-NEXT:    shl.b32 %r12, %r7, 16;
+; CHECK-NEXT:    shl.b32 %r13, %r6, 24;
+; CHECK-NEXT:    or.b32 %r14, %r13, %r12;
+; CHECK-NEXT:    or.b32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r9;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
-; CHECK-NEXT:    shr.u32 %r21, %r19, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r21;
-; CHECK-NEXT:    shr.u32 %r22, %r19, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r22;
-; CHECK-NEXT:    shr.u32 %r23, %r19, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r23;
+; CHECK-NEXT:    shr.u32 %r16, %r15, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r16;
+; CHECK-NEXT:    shr.u32 %r17, %r15, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT:    shr.u32 %r18, %r15, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r18;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
   ret %s_i8f16x2p %r
@@ -278,9 +278,9 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-LABEL: test_s_i8f32p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<24>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_s_i8f32p_param_0];
@@ -307,22 +307,22 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-NEXT:    ld.param.b8 %r8, [retval0+6];
 ; CHECK-NEXT:    ld.param.b8 %r9, [retval0+5];
 ; CHECK-NEXT:    } // callseq 5
-; CHECK-NEXT:    shl.b32 %r12, %r8, 8;
-; CHECK-NEXT:    or.b32 %r13, %r12, %r9;
-; CHECK-NEXT:    shl.b32 %r15, %r7, 16;
-; CHECK-NEXT:    shl.b32 %r17, %r6, 24;
-; CHECK-NEXT:    or.b32 %r18, %r17, %r15;
-; CHECK-NEXT:    or.b32 %r19, %r18, %r13;
+; CHECK-NEXT:    shl.b32 %r10, %r8, 8;
+; CHECK-NEXT:    or.b32 %r11, %r10, %r9;
+; CHECK-NEXT:    shl.b32 %r12, %r7, 16;
+; CHECK-NEXT:    shl.b32 %r13, %r6, 24;
+; CHECK-NEXT:    or.b32 %r14, %r13, %r12;
+; CHECK-NEXT:    or.b32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r9;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
-; CHECK-NEXT:    shr.u32 %r21, %r19, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r21;
-; CHECK-NEXT:    shr.u32 %r22, %r19, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r22;
-; CHECK-NEXT:    shr.u32 %r23, %r19, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r23;
+; CHECK-NEXT:    shr.u32 %r16, %r15, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r16;
+; CHECK-NEXT:    shr.u32 %r17, %r15, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT:    shr.u32 %r18, %r15, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r18;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
   ret %s_i8f32p %r
@@ -332,8 +332,8 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
 ; CHECK-LABEL: test_s_i8f64p(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b64 %rd<46>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b64 %rd<36>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_s_i8f64p_param_0];
@@ -360,38 +360,38 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rd13, [retval0+10];
 ; CHECK-NEXT:    ld.param.b8 %rd14, [retval0+9];
 ; CHECK-NEXT:    } // callseq 6
-; CHECK-NEXT:    shl.b64 %rd17, %rd13, 8;
-; CHECK-NEXT:    or.b64 %rd18, %rd17, %rd14;
-; CHECK-NEXT:    shl.b64 %rd20, %rd12, 16;
-; CHECK-NEXT:    shl.b64 %rd22, %rd11, 24;
-; CHECK-NEXT:    or.b64 %rd23, %rd22, %rd20;
-; CHECK-NEXT:    or.b64 %rd24, %rd23, %rd18;
-; CHECK-NEXT:    shl.b64 %rd27, %rd9, 8;
-; CHECK-NEXT:    or.b64 %rd28, %rd27, %rd10;
-; CHECK-NEXT:    shl.b64 %rd30, %rd8, 16;
-; CHECK-NEXT:    shl.b64 %rd32, %rd7, 24;
-; CHECK-NEXT:    or.b64 %rd33, %rd32, %rd30;
-; CHECK-NEXT:    or.b64 %rd34, %rd33, %rd28;
-; CHECK-NEXT:    shl.b64 %rd35, %rd34, 32;
-; CHECK-NEXT:    or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT:    shl.b64 %rd15, %rd13, 8;
+; CHECK-NEXT:    or.b64 %rd16, %rd15, %rd14;
+; CHECK-NEXT:    shl.b64 %rd17, %rd12, 16;
+; CHECK-NEXT:    shl.b64 %rd18, %rd11, 24;
+; CHECK-NEXT:    or.b64 %rd19, %rd18, %rd17;
+; CHECK-NEXT:    or.b64 %rd20, %rd19, %rd16;
+; CHECK-NEXT:    shl.b64 %rd21, %rd9, 8;
+; CHECK-NEXT:    or.b64 %rd22, %rd21, %rd10;
+; CHECK-NEXT:    shl.b64 %rd23, %rd8, 16;
+; CHECK-NEXT:    shl.b64 %rd24, %rd7, 24;
+; CHECK-NEXT:    or.b64 %rd25, %rd24, %rd23;
+; CHECK-NEXT:    or.b64 %rd26, %rd25, %rd22;
+; CHECK-NEXT:    shl.b64 %rd27, %rd26, 32;
+; CHECK-NEXT:    or.b64 %rd28, %rd27, %rd20;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+9], %rd14;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+24], %rd5;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+8], %rs1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
-; CHECK-NEXT:    shr.u64 %rd39, %rd36, 56;
-; CHECK-NEXT:    st.param.b8 [func_retval0+16], %rd39;
-; CHECK-NEXT:    shr.u64 %rd40, %rd36, 48;
-; CHECK-NEXT:    st.param.b8 [func_retval0+15], %rd40;
-; CHECK-NEXT:    shr.u64 %rd41, %rd36, 40;
-; CHECK-NEXT:    st.param.b8 [func_retval0+14], %rd41;
-; CHECK-NEXT:    shr.u64 %rd42, %rd36, 32;
-; CHECK-NEXT:    st.param.b8 [func_retval0+13], %rd42;
-; CHECK-NEXT:    shr.u64 %rd43, %rd36, 24;
-; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT:    shr.u64 %rd44, %rd36, 16;
-; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd44;
-; CHECK-NEXT:    shr.u64 %rd45, %rd36, 8;
-; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd45;
+; CHECK-NEXT:    shr.u64 %rd29, %rd28, 56;
+; CHECK-NEXT:    st.param.b8 [func_retval0+16], %rd29;
+; CHECK-NEXT:    shr.u64 %rd30, %rd28, 48;
+; CHECK-NEXT:    st.param.b8 [func_retval0+15], %rd30;
+; CHECK-NEXT:    shr.u64 %rd31, %rd28, 40;
+; CHECK-NEXT:    st.param.b8 [func_retval0+14], %rd31;
+; CHECK-NEXT:    shr.u64 %rd32, %rd28, 32;
+; CHECK-NEXT:    st.param.b8 [func_retval0+13], %rd32;
+; CHECK-NEXT:    shr.u64 %rd33, %rd28, 24;
+; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd33;
+; CHECK-NEXT:    shr.u64 %rd34, %rd28, 16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd34;
+; CHECK-NEXT:    shr.u64 %rd35, %rd28, 8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd35;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
   ret %s_i8f64p %r
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index a9b3675b67155..890753b6ac5aa 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -104,7 +104,7 @@ define dso_local i32 @foo() {
 ; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot1[40];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
@@ -143,29 +143,29 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<6>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<9>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot2;
 ; CHECK-PTX-NEXT:    ld.param.b32 %r1, [variadics2_param_0];
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [variadics2_param_1];
-; CHECK-PTX-NEXT:    add.u64 %rd3, %SPL, 0;
-; CHECK-PTX-NEXT:    add.s64 %rd4, %rd1, 7;
-; CHECK-PTX-NEXT:    and.b64 %rd5, %rd4, -8;
-; CHECK-PTX-NEXT:    ld.b32 %r2, [%rd5];
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd5+4];
-; CHECK-PTX-NEXT:    ld.b8 %rs1, [%rd5+7];
-; CHECK-PTX-NEXT:    st.local.b8 [%rd3+2], %rs1;
-; CHECK-PTX-NEXT:    ld.b8 %rs2, [%rd5+6];
-; CHECK-PTX-NEXT:    st.local.b8 [%rd3+1], %rs2;
-; CHECK-PTX-NEXT:    ld.b8 %rs3, [%rd5+5];
-; CHECK-PTX-NEXT:    st.local.b8 [%rd3], %rs3;
-; CHECK-PTX-NEXT:    ld.b64 %rd6, [%rd5+8];
+; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; CHECK-PTX-NEXT:    add.s64 %rd3, %rd1, 7;
+; CHECK-PTX-NEXT:    and.b64 %rd4, %rd3, -8;
+; CHECK-PTX-NEXT:    ld.b32 %r2, [%rd4];
+; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd4+4];
+; CHECK-PTX-NEXT:    ld.b8 %rs1, [%rd4+7];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT:    ld.b8 %rs2, [%rd4+6];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT:    ld.b8 %rs3, [%rd4+5];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs3;
+; CHECK-PTX-NEXT:    ld.b64 %rd5, [%rd4+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd7, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd8, %rd7, %rd6;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %rd8;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd6, %r5;
+; CHECK-PTX-NEXT:    add.s64 %rd7, %rd6, %rd5;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %rd7;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -202,19 +202,19 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<4>;
-; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot3;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; CHECK-PTX-NEXT:    add.u64 %rd1, %SPL, 0;
 ; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT:    st.local.b8 [%rd1+2], %rs1;
 ; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT:    st.local.b8 [%rd1+1], %rs2;
 ; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs3;
+; CHECK-PTX-NEXT:    st.local.b8 [%rd1], %rs3;
 ; CHECK-PTX-NEXT:    st.b32 [%SP+8], 1;
 ; CHECK-PTX-NEXT:    st.b8 [%SP+12], 1;
 ; CHECK-PTX-NEXT:    st.b64 [%SP+16], 1;
@@ -222,8 +222,8 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
-; CHECK-PTX-NEXT:    add.u64 %rd3, %SP, 8;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd3;
+; CHECK-PTX-NEXT:    add.u64 %rd2, %SP, 8;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd2;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT:    call.uni (retval0), variadics2, (param0, param1);
 ; CHECK-PTX-NEXT:    ld.param.b32 %r1, [retval0];
@@ -282,7 +282,7 @@ define dso_local i32 @baz() {
 ; CHECK-PTX-NEXT:    .local .align 16 .b8 __local_depot5[16];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
@@ -309,18 +309,18 @@ entry:
 define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
 ; CHECK-PTX-LABEL: variadics4(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .reg .b64 %rd<10>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
-; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [variadics4_param_1];
-; CHECK-PTX-NEXT:    add.s64 %rd3, %rd2, 7;
-; CHECK-PTX-NEXT:    and.b64 %rd4, %rd3, -8;
-; CHECK-PTX-NEXT:    ld.b64 %rd5, [%rd4];
-; CHECK-PTX-NEXT:    ld.param.b64 %rd6, [variadics4_param_0];
-; CHECK-PTX-NEXT:    ld.param.b64 %rd7, [variadics4_param_0+8];
-; CHECK-PTX-NEXT:    add.s64 %rd8, %rd6, %rd7;
-; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, %rd5;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %rd9;
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [variadics4_param_1];
+; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
+; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
+; CHECK-PTX-NEXT:    ld.b64 %rd4, [%rd3];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd5, [variadics4_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd6, [variadics4_param_0+8];
+; CHECK-PTX-NEXT:    add.s64 %rd7, %rd5, %rd6;
+; CHECK-PTX-NEXT:    add.s64 %rd8, %rd7, %rd4;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %rd8;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -348,27 +348,27 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot7[24];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot7;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT:    ld.global.nc.b64 %rd3, [__const_$_qux_$_s+8];
-; CHECK-PTX-NEXT:    st.local.b64 [%rd2+8], %rd3;
-; CHECK-PTX-NEXT:    ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT:    st.local.b64 [%rd2], %rd4;
+; CHECK-PTX-NEXT:    add.u64 %rd1, %SPL, 0;
+; CHECK-PTX-NEXT:    ld.global.nc.b64 %rd2, [__const_$_qux_$_s+8];
+; CHECK-PTX-NEXT:    st.local.b64 [%rd1+8], %rd2;
+; CHECK-PTX-NEXT:    ld.global.nc.b64 %rd3, [__const_$_qux_$_s];
+; CHECK-PTX-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    st.b64 [%SP+16], 1;
 ; CHECK-PTX-NEXT:    { // callseq 3, 0
 ; CHECK-PTX-NEXT:    .param .align 8 .b8 param0[16];
 ; CHECK-PTX-NEXT:    .param .b64 param1;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
-; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 16;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
-; CHECK-PTX-NEXT:    ld.local.b64 %rd6, [%rd2+8];
-; CHECK-PTX-NEXT:    st.param.b64 [param0+8], %rd6;
-; CHECK-PTX-NEXT:    ld.local.b64 %rd7, [%rd2];
-; CHECK-PTX-NEXT:    st.param.b64 [param0], %rd7;
+; CHECK-PTX-NEXT:    add.u64 %rd4, %SP, 16;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd4;
+; CHECK-PTX-NEXT:    ld.local.b64 %rd5, [%rd1+8];
+; CHECK-PTX-NEXT:    st.param.b64 [param0+8], %rd5;
+; CHECK-PTX-NEXT:    ld.local.b64 %rd6, [%rd1];
+; CHECK-PTX-NEXT:    st.param.b64 [param0], %rd6;
 ; CHECK-PTX-NEXT:    call.uni (retval0), variadics4, (param0, param1);
 ; CHECK-PTX-NEXT:    } // callseq 3
 ; CHECK-PTX-NEXT:    ret;



More information about the llvm-commits mailing list