[llvm] [NVPTX] Remove Float register classes (PR #140487)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Sun May 18 17:32:45 PDT 2025


https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/140487

These classes are redundant, as the untyped "Int" register classes can be used for all floating-point operations. This change is intended to be as minimal as possible; the many simplifications and refactors it exposes are left as future work.
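
For illustration, a minimal sketch of the effect on generated PTX, distilled from the test updates in the second patch (register numbers are schematic): f32 values are now assigned to the untyped .b32 "%r" registers rather than the dedicated "%f" registers, and f64 values to the .b64 "%rd" registers rather than "%fd", while the floating-point instructions themselves are unchanged. The defvar aliases added at the end of NVPTXRegisterInfo.td keep existing TableGen references to Float32Regs/Float64Regs resolving to the Int classes.

  define float @fadd_f32(float %a, float %b) {
    %ret = fadd float %a, %b
    ret float %ret
  }

  ; PTX before: add.f32 %f3, %f1, %f2;
  ; PTX after:  add.f32 %r3, %r1, %r2;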

From 124bcf8b8d153041dbe007b9da17e44ccdd7f9f5 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 30 Apr 2025 17:55:21 +0000
Subject: [PATCH 1/2] code updates

---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp   |  4 ----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 11 ++++-------
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp    | 12 ++----------
 llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp |  8 --------
 llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td  | 12 ++++++------
 5 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 0e5207cf9b04c..e2e42ff771336 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -223,10 +223,6 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
       Ret = (3 << 28);
     } else if (RC == &NVPTX::Int64RegsRegClass) {
       Ret = (4 << 28);
-    } else if (RC == &NVPTX::Float32RegsRegClass) {
-      Ret = (5 << 28);
-    } else if (RC == &NVPTX::Float64RegsRegClass) {
-      Ret = (6 << 28);
     } else if (RC == &NVPTX::Int128RegsRegClass) {
       Ret = (7 << 28);
     } else {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 82d00ef8eccb9..9a82db31e43a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -596,8 +596,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
-  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
-  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Int64RegsRegClass);
   addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
   addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
   addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
@@ -4931,13 +4931,14 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     case 'b':
       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
     case 'c':
-      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
     case 'h':
       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
     case 'r':
+    case 'f':
       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
     case 'l':
     case 'N':
+    case 'd':
       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
     case 'q': {
       if (STI.getSmVersion() < 70)
@@ -4945,10 +4946,6 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                            "supported for sm_70 and higher!");
       return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
     }
-    case 'f':
-      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
-    case 'd':
-      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
     }
   }
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 67dc7904a91ae..f262a0fb66c25 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -44,19 +44,11 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   } else if (DestRC == &NVPTX::Int16RegsRegClass) {
     Op = NVPTX::MOV16r;
   } else if (DestRC == &NVPTX::Int32RegsRegClass) {
-    Op = (SrcRC == &NVPTX::Int32RegsRegClass ? NVPTX::IMOV32r
-                                             : NVPTX::BITCONVERT_32_F2I);
+    Op = NVPTX::IMOV32r;
   } else if (DestRC == &NVPTX::Int64RegsRegClass) {
-    Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64r
-                                             : NVPTX::BITCONVERT_64_F2I);
+    Op = NVPTX::IMOV64r;
   } else if (DestRC == &NVPTX::Int128RegsRegClass) {
     Op = NVPTX::IMOV128r;
-  } else if (DestRC == &NVPTX::Float32RegsRegClass) {
-    Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32r
-                                               : NVPTX::BITCONVERT_32_I2F);
-  } else if (DestRC == &NVPTX::Float64RegsRegClass) {
-    Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64r
-                                               : NVPTX::BITCONVERT_64_I2F);
   } else {
     llvm_unreachable("Bad register copy");
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 6b9797c3e6aae..eb60e1502cf90 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -25,10 +25,6 @@ using namespace llvm;
 
 namespace llvm {
 StringRef getNVPTXRegClassName(TargetRegisterClass const *RC) {
-  if (RC == &NVPTX::Float32RegsRegClass)
-    return ".b32";
-  if (RC == &NVPTX::Float64RegsRegClass)
-    return ".b64";
   if (RC == &NVPTX::Int128RegsRegClass)
     return ".b128";
   if (RC == &NVPTX::Int64RegsRegClass)
@@ -63,10 +59,6 @@ StringRef getNVPTXRegClassName(TargetRegisterClass const *RC) {
 }
 
 StringRef getNVPTXRegClassStr(TargetRegisterClass const *RC) {
-  if (RC == &NVPTX::Float32RegsRegClass)
-    return "%f";
-  if (RC == &NVPTX::Float64RegsRegClass)
-    return "%fd";
   if (RC == &NVPTX::Int128RegsRegClass)
     return "%rq";
   if (RC == &NVPTX::Int64RegsRegClass)
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 2011f0f7e328f..2eea9e9721cdf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -40,8 +40,6 @@ foreach i = 0...4 in {
   def RQ#i : NVPTXReg<"%rq"#i>; // 128-bit
   def H#i  : NVPTXReg<"%h"#i>;  // 16-bit float
   def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
-  def F#i  : NVPTXReg<"%f"#i>;  // 32-bit float
-  def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
 
   // Arguments
   def ia#i : NVPTXReg<"%ia"#i>;
@@ -59,14 +57,13 @@ foreach i = 0...31 in {
 //===----------------------------------------------------------------------===//
 def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
 def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>;
-def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
+def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32,
                               (add (sequence "R%u", 0, 4),
                               VRFrame32, VRFrameLocal32)>;
-def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
+def Int64Regs : NVPTXRegClass<[i64, f64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
 // 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only.
 def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>;
-def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
-def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
+
 def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
 def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
 def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
@@ -75,3 +72,6 @@ def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
 def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame32, VRFrameLocal32, VRDepot,
                                             (sequence "ENVREG%u", 0, 31))>;
+
+defvar Float32Regs = Int32Regs;
+defvar Float64Regs = Int64Regs;

From 76be42d9e025d4b9e6d65fbd61ce7ed7845c1825 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 30 Apr 2025 17:55:36 +0000
Subject: [PATCH 2/2] test updates

---
 .../test/CodeGen/NVPTX/LoadStoreVectorizer.ll |  106 +-
 llvm/test/CodeGen/NVPTX/access-non-generic.ll |   20 +-
 llvm/test/CodeGen/NVPTX/aggregate-return.ll   |   16 +-
 llvm/test/CodeGen/NVPTX/and-or-setcc.ll       |   26 +-
 llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll |   16 +-
 llvm/test/CodeGen/NVPTX/atomics-with-scope.ll |    4 +-
 llvm/test/CodeGen/NVPTX/atomics.ll            |   81 +-
 llvm/test/CodeGen/NVPTX/bf16-instructions.ll  | 1082 +++--
 .../NVPTX/bf16x2-instructions-approx.ll       |   30 +-
 .../test/CodeGen/NVPTX/bf16x2-instructions.ll |  253 +-
 llvm/test/CodeGen/NVPTX/bug22322.ll           |    2 +-
 .../CodeGen/NVPTX/call-with-alloca-buffer.ll  |    4 +-
 llvm/test/CodeGen/NVPTX/convert-fp-i8.ll      |   60 +-
 llvm/test/CodeGen/NVPTX/convert-fp.ll         |   52 +-
 llvm/test/CodeGen/NVPTX/convert-sm100.ll      |   36 +-
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll     |  130 +-
 llvm/test/CodeGen/NVPTX/convert-sm80.ll       |  143 +-
 llvm/test/CodeGen/NVPTX/convert-sm90.ll       |   36 +-
 llvm/test/CodeGen/NVPTX/copysign.ll           |   68 +-
 .../NVPTX/distributed-shared-cluster.ll       |    9 +-
 llvm/test/CodeGen/NVPTX/div.ll                |   20 +-
 llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll |   13 +-
 llvm/test/CodeGen/NVPTX/f16-abs.ll            |    8 +-
 llvm/test/CodeGen/NVPTX/f16-instructions.ll   |  256 +-
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll |  916 ++--
 llvm/test/CodeGen/NVPTX/f32-ex2.ll            |   16 +-
 llvm/test/CodeGen/NVPTX/f32-lg2.ll            |   16 +-
 llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll    |   24 +-
 llvm/test/CodeGen/NVPTX/fexp2.ll              |  249 +-
 llvm/test/CodeGen/NVPTX/flog2.ll              |  152 +-
 llvm/test/CodeGen/NVPTX/fma-assoc.ll          |   48 +-
 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll  |  717 ++-
 .../CodeGen/NVPTX/fma-relu-fma-intrinsic.ll   |  558 ++-
 .../NVPTX/fma-relu-instruction-flag.ll        | 1158 +++--
 llvm/test/CodeGen/NVPTX/fma.ll                |  108 +-
 llvm/test/CodeGen/NVPTX/fp-contract.ll        |   80 +-
 llvm/test/CodeGen/NVPTX/fp-literals.ll        |    4 +-
 llvm/test/CodeGen/NVPTX/frem.ll               |  264 +-
 llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll       |    8 +-
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll  |   16 +-
 llvm/test/CodeGen/NVPTX/inline-asm.ll         |   31 +-
 llvm/test/CodeGen/NVPTX/intrinsics.ll         |   32 +-
 llvm/test/CodeGen/NVPTX/ld-generic.ll         |  145 +-
 llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py   |    4 +-
 llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll  |   70 +-
 llvm/test/CodeGen/NVPTX/ldg-invariant.ll      |   64 +-
 llvm/test/CodeGen/NVPTX/ldparam-v4.ll         |   22 +-
 llvm/test/CodeGen/NVPTX/ldu-ldg.ll            |   26 +-
 .../load-store-256-addressing-invariant.ll    |   52 +-
 .../NVPTX/load-store-256-addressing.ll        |   52 +-
 llvm/test/CodeGen/NVPTX/load-store-scalars.ll |  544 ++-
 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll   | 3873 ++++++++++++-----
 llvm/test/CodeGen/NVPTX/load-store-sm-90.ll   | 1521 +++++--
 .../CodeGen/NVPTX/load-store-vectors-256.ll   |  214 +-
 llvm/test/CodeGen/NVPTX/load-store-vectors.ll |  280 +-
 llvm/test/CodeGen/NVPTX/math-intrins.ll       | 1082 +++--
 llvm/test/CodeGen/NVPTX/misched_func_call.ll  |   15 +-
 llvm/test/CodeGen/NVPTX/param-add.ll          |   24 +-
 llvm/test/CodeGen/NVPTX/param-load-store.ll   |   16 +-
 llvm/test/CodeGen/NVPTX/param-overalign.ll    |   32 +-
 .../CodeGen/NVPTX/proxy-reg-erasure-ptx.ll    |   14 +-
 llvm/test/CodeGen/NVPTX/rcp-opt.ll            |   30 +-
 .../CodeGen/NVPTX/reduction-intrinsics.ll     |  402 +-
 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll     |   88 +-
 llvm/test/CodeGen/NVPTX/reg-types.ll          |    6 +-
 llvm/test/CodeGen/NVPTX/shfl-p.ll             |  305 +-
 llvm/test/CodeGen/NVPTX/shfl-sync-p.ll        |   32 +-
 llvm/test/CodeGen/NVPTX/shfl.ll               |   10 +-
 llvm/test/CodeGen/NVPTX/st-addrspace.ll       |   24 +-
 llvm/test/CodeGen/NVPTX/st-generic.ll         |    8 +-
 llvm/test/CodeGen/NVPTX/st-param-imm.ll       |  166 +-
 llvm/test/CodeGen/NVPTX/surf-read-cuda.ll     |   14 +-
 llvm/test/CodeGen/NVPTX/surf-read.ll          |    4 +-
 llvm/test/CodeGen/NVPTX/surf-tex.py           |   10 +-
 .../test/CodeGen/NVPTX/tag-invariant-loads.ll |   14 +-
 llvm/test/CodeGen/NVPTX/tex-read-cuda.ll      |   25 +-
 llvm/test/CodeGen/NVPTX/tex-read.ll           |    4 +-
 .../NVPTX/unaligned-param-load-store.ll       |    8 +-
 llvm/test/CodeGen/NVPTX/vaargs.ll             |    4 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  |   23 +-
 llvm/test/CodeGen/NVPTX/vec-param-load.ll     |   20 +-
 llvm/test/CodeGen/NVPTX/vector-args.ll        |    8 +-
 llvm/test/CodeGen/NVPTX/vector-loads.ll       |   74 +-
 llvm/test/CodeGen/NVPTX/wmma.py               |    4 +-
 84 files changed, 9024 insertions(+), 7177 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 78b57badc06e8..1207c429524ca 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -45,36 +45,36 @@ define half @fh(ptr %p) {
 ; ENABLED-LABEL: fh(
 ; ENABLED:       {
 ; ENABLED-NEXT:    .reg .b16 %rs<10>;
-; ENABLED-NEXT:    .reg .b32 %f<13>;
+; ENABLED-NEXT:    .reg .b32 %r<13>;
 ; ENABLED-NEXT:    .reg .b64 %rd<2>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
 ; ENABLED-NEXT:    ld.param.b64 %rd1, [fh_param_0];
 ; ENABLED-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; ENABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
-; ENABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
-; ENABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
-; ENABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; ENABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
-; ENABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
-; ENABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
-; ENABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; ENABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
-; ENABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
-; ENABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
-; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
-; ENABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
-; ENABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
-; ENABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
-; ENABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
-; ENABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
+; ENABLED-NEXT:    cvt.f32.f16 %r1, %rs2;
+; ENABLED-NEXT:    cvt.f32.f16 %r2, %rs1;
+; ENABLED-NEXT:    add.rn.f32 %r3, %r2, %r1;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs6, %r3;
+; ENABLED-NEXT:    cvt.f32.f16 %r4, %rs4;
+; ENABLED-NEXT:    cvt.f32.f16 %r5, %rs3;
+; ENABLED-NEXT:    add.rn.f32 %r6, %r5, %r4;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs7, %r6;
+; ENABLED-NEXT:    cvt.f32.f16 %r7, %rs7;
+; ENABLED-NEXT:    cvt.f32.f16 %r8, %rs6;
+; ENABLED-NEXT:    add.rn.f32 %r9, %r8, %r7;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs8, %r9;
+; ENABLED-NEXT:    cvt.f32.f16 %r10, %rs8;
+; ENABLED-NEXT:    cvt.f32.f16 %r11, %rs5;
+; ENABLED-NEXT:    add.rn.f32 %r12, %r10, %r11;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs9, %r12;
 ; ENABLED-NEXT:    st.param.b16 [func_retval0], %rs9;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: fh(
 ; DISABLED:       {
 ; DISABLED-NEXT:    .reg .b16 %rs<10>;
-; DISABLED-NEXT:    .reg .b32 %f<13>;
+; DISABLED-NEXT:    .reg .b32 %r<13>;
 ; DISABLED-NEXT:    .reg .b64 %rd<2>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
@@ -84,22 +84,22 @@ define half @fh(ptr %p) {
 ; DISABLED-NEXT:    ld.b16 %rs3, [%rd1+4];
 ; DISABLED-NEXT:    ld.b16 %rs4, [%rd1+6];
 ; DISABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
-; DISABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
-; DISABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
-; DISABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; DISABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
-; DISABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
-; DISABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
-; DISABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; DISABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
-; DISABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
-; DISABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
-; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
-; DISABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
-; DISABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
-; DISABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
-; DISABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
-; DISABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
+; DISABLED-NEXT:    cvt.f32.f16 %r1, %rs2;
+; DISABLED-NEXT:    cvt.f32.f16 %r2, %rs1;
+; DISABLED-NEXT:    add.rn.f32 %r3, %r2, %r1;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs6, %r3;
+; DISABLED-NEXT:    cvt.f32.f16 %r4, %rs4;
+; DISABLED-NEXT:    cvt.f32.f16 %r5, %rs3;
+; DISABLED-NEXT:    add.rn.f32 %r6, %r5, %r4;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs7, %r6;
+; DISABLED-NEXT:    cvt.f32.f16 %r7, %rs7;
+; DISABLED-NEXT:    cvt.f32.f16 %r8, %rs6;
+; DISABLED-NEXT:    add.rn.f32 %r9, %r8, %r7;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs8, %r9;
+; DISABLED-NEXT:    cvt.f32.f16 %r10, %rs8;
+; DISABLED-NEXT:    cvt.f32.f16 %r11, %rs5;
+; DISABLED-NEXT:    add.rn.f32 %r12, %r10, %r11;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs9, %r12;
 ; DISABLED-NEXT:    st.param.b16 [func_retval0], %rs9;
 ; DISABLED-NEXT:    ret;
   %p.1 = getelementptr half, ptr %p, i32 1
@@ -121,37 +121,37 @@ define half @fh(ptr %p) {
 define float @ff(ptr %p) {
 ; ENABLED-LABEL: ff(
 ; ENABLED:       {
-; ENABLED-NEXT:    .reg .b32 %f<10>;
+; ENABLED-NEXT:    .reg .b32 %r<10>;
 ; ENABLED-NEXT:    .reg .b64 %rd<2>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
 ; ENABLED-NEXT:    ld.param.b64 %rd1, [ff_param_0];
-; ENABLED-NEXT:    ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; ENABLED-NEXT:    ld.b32 %f5, [%rd1+16];
-; ENABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
-; ENABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
-; ENABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
-; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
-; ENABLED-NEXT:    st.param.b32 [func_retval0], %f9;
+; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT:    ld.b32 %r5, [%rd1+16];
+; ENABLED-NEXT:    add.rn.f32 %r6, %r1, %r2;
+; ENABLED-NEXT:    add.rn.f32 %r7, %r3, %r4;
+; ENABLED-NEXT:    add.rn.f32 %r8, %r6, %r7;
+; ENABLED-NEXT:    add.rn.f32 %r9, %r8, %r5;
+; ENABLED-NEXT:    st.param.b32 [func_retval0], %r9;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: ff(
 ; DISABLED:       {
-; DISABLED-NEXT:    .reg .b32 %f<10>;
+; DISABLED-NEXT:    .reg .b32 %r<10>;
 ; DISABLED-NEXT:    .reg .b64 %rd<2>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
 ; DISABLED-NEXT:    ld.param.b64 %rd1, [ff_param_0];
-; DISABLED-NEXT:    ld.b32 %f1, [%rd1];
-; DISABLED-NEXT:    ld.b32 %f2, [%rd1+4];
-; DISABLED-NEXT:    ld.b32 %f3, [%rd1+8];
-; DISABLED-NEXT:    ld.b32 %f4, [%rd1+12];
-; DISABLED-NEXT:    ld.b32 %f5, [%rd1+16];
-; DISABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
-; DISABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
-; DISABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
-; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
-; DISABLED-NEXT:    st.param.b32 [func_retval0], %f9;
+; DISABLED-NEXT:    ld.b32 %r1, [%rd1];
+; DISABLED-NEXT:    ld.b32 %r2, [%rd1+4];
+; DISABLED-NEXT:    ld.b32 %r3, [%rd1+8];
+; DISABLED-NEXT:    ld.b32 %r4, [%rd1+12];
+; DISABLED-NEXT:    ld.b32 %r5, [%rd1+16];
+; DISABLED-NEXT:    add.rn.f32 %r6, %r1, %r2;
+; DISABLED-NEXT:    add.rn.f32 %r7, %r3, %r4;
+; DISABLED-NEXT:    add.rn.f32 %r8, %r6, %r7;
+; DISABLED-NEXT:    add.rn.f32 %r9, %r8, %r5;
+; DISABLED-NEXT:    st.param.b32 [func_retval0], %r9;
 ; DISABLED-NEXT:    ret;
   %p.1 = getelementptr float, ptr %p, i32 1
   %p.2 = getelementptr float, ptr %p, i32 2
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index a816f2e84b064..9edd4de017ee2 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -23,10 +23,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   ; load cast
   %1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
   call void @use(float %1)
-; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar];
+; PTX: ld.shared.b32 %r{{[0-9]+}}, [scalar];
   ; store cast
   store float %v, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
-; PTX: st.shared.b32 [scalar], %f{{[0-9]+}};
+; PTX: st.shared.b32 [scalar], %r{{[0-9]+}};
   ; use syncthreads to disable optimizations across components
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
@@ -35,20 +35,20 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   %2 = addrspacecast ptr addrspace(3) @scalar to ptr
   %3 = load float, ptr %2, align 4
   call void @use(float %3)
-; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar];
+; PTX: ld.shared.b32 %r{{[0-9]+}}, [scalar];
   ; cast; store
   store float %v, ptr %2, align 4
-; PTX: st.shared.b32 [scalar], %f{{[0-9]+}};
+; PTX: st.shared.b32 [scalar], %r{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ; load gep cast
   %4 = load float, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4
   call void @use(float %4)
-; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20];
+; PTX: ld.shared.b32 %r{{[0-9]+}}, [array+20];
   ; store gep cast
   store float %v, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4
-; PTX: st.shared.b32 [array+20], %f{{[0-9]+}};
+; PTX: st.shared.b32 [array+20], %r{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
@@ -56,10 +56,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   %5 = getelementptr inbounds [10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5
   %6 = load float, ptr %5, align 4
   call void @use(float %6)
-; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20];
+; PTX: ld.shared.b32 %r{{[0-9]+}}, [array+20];
   ; gep cast; store
   store float %v, ptr %5, align 4
-; PTX: st.shared.b32 [array+20], %f{{[0-9]+}};
+; PTX: st.shared.b32 [array+20], %r{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
@@ -68,10 +68,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   %8 = getelementptr inbounds [10 x float], ptr %7, i32 0, i32 %i
   %9 = load float, ptr %8, align 4
   call void @use(float %9)
-; PTX: ld.shared.b32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
+; PTX: ld.shared.b32 %r{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
   ; cast; gep; store
   store float %v, ptr %8, align 4
-; PTX: st.shared.b32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
+; PTX: st.shared.b32 [%{{(r|rl|rd)[0-9]+}}], %r{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index 72c302433f081..1c8f019922e37 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -10,7 +10,7 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-LABEL: @test_v2f32
   %call = tail call <2 x float> @barv(<2 x float> %input)
 ; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
+; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0];
   store <2 x float> %call, ptr %output, align 8
 ; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
   ret void
@@ -21,10 +21,10 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
 ;
   %call = tail call <3 x float> @barv3(<3 x float> %input)
 ; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK-DAG: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
-; CHECK-DAG: ld.param.b32 [[E2:%f[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [retval0+8];
 ; Make sure we don't load more values than we need to.
-; CHECK-NOT: ld.param.b32 [[E3:%f[0-9]+]], [retval0+12];
+; CHECK-NOT: ld.param.b32 [[E3:%r[0-9]+]], [retval0+12];
   store <3 x float> %call, ptr %output, align 8
 ; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8],
 ; -- This is suboptimal. We should do st.v2.f32 instead
@@ -38,8 +38,8 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
 ; CHECK-LABEL: @test_a2f32
   %call = tail call [2 x float] @bara([2 x float] %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
-; CHECK-DAG: ld.param.b32 [[ELEMA1:%f[0-9]+]], [retval0];
-; CHECK-DAG: ld.param.b32 [[ELEMA2:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[ELEMA1:%r[0-9]+]], [retval0];
+; CHECK-DAG: ld.param.b32 [[ELEMA2:%r[0-9]+]], [retval0+4];
   store [2 x float] %call, ptr %output, align 4
 ; CHECK: }
 ; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]]
@@ -52,8 +52,8 @@ define void @test_s2f32({float, float} %input, ptr %output) {
 ; CHECK-LABEL: @test_s2f32
   %call = tail call {float, float} @bars({float, float} %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
-; CHECK-DAG: ld.param.b32 [[ELEMS1:%f[0-9]+]], [retval0];
-; CHECK-DAG: ld.param.b32 [[ELEMS2:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[ELEMS1:%r[0-9]+]], [retval0];
+; CHECK-DAG: ld.param.b32 [[ELEMS2:%r[0-9]+]], [retval0+4];
   store {float, float} %call, ptr %output, align 4
 ; CHECK: }
 ; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]]
diff --git a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll
index 53c741bd6cb2c..b7e6e8b85298a 100644
--- a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll
@@ -8,15 +8,14 @@ define i1 @and_ord(float %a, float %b) {
 ; CHECK-LABEL: and_ord(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [and_ord_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [and_ord_param_1];
-; CHECK-NEXT:    setp.num.f32 %p1, %f1, %f2;
-; CHECK-NEXT:    selp.b32 %r1, 1, 0, %p1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [and_ord_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [and_ord_param_1];
+; CHECK-NEXT:    setp.num.f32 %p1, %r1, %r2;
+; CHECK-NEXT:    selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %c = fcmp ord float %a, 0.0
   %d = fcmp ord float %b, 0.0
@@ -28,15 +27,14 @@ define i1 @or_uno(float %a, float %b) {
 ; CHECK-LABEL: or_uno(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [or_uno_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [or_uno_param_1];
-; CHECK-NEXT:    setp.nan.f32 %p1, %f1, %f2;
-; CHECK-NEXT:    selp.b32 %r1, 1, 0, %p1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [or_uno_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [or_uno_param_1];
+; CHECK-NEXT:    setp.nan.f32 %p1, %r1, %r2;
+; CHECK-NEXT:    selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %c = fcmp uno float %a, 0.0
   %d = fcmp uno float %b, 0.0
diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
index 33f76a882e9cd..ce71d3a78c0de 100644
--- a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -11,28 +11,28 @@
 ;;; f64
 
 define double @fadd_f64(double %a, double %b) {
-; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
+; CHECK: add.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = fadd double %a, %b
   ret double %ret
 }
 
 define double @fsub_f64(double %a, double %b) {
-; CHECK: sub.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
+; CHECK: sub.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = fsub double %a, %b
   ret double %ret
 }
 
 define double @fmul_f64(double %a, double %b) {
-; CHECK: mul.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
+; CHECK: mul.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = fmul double %a, %b
   ret double %ret
 }
 
 define double @fdiv_f64(double %a, double %b) {
-; CHECK: div.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
+; CHECK: div.rn.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = fdiv double %a, %b
   ret double %ret
@@ -44,28 +44,28 @@ define double @fdiv_f64(double %a, double %b) {
 ;;; f32
 
 define float @fadd_f32(float %a, float %b) {
-; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: add.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = fadd float %a, %b
   ret float %ret
 }
 
 define float @fsub_f32(float %a, float %b) {
-; CHECK: sub.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: sub.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = fsub float %a, %b
   ret float %ret
 }
 
 define float @fmul_f32(float %a, float %b) {
-; CHECK: mul.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: mul.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = fmul float %a, %b
   ret float %ret
 }
 
 define float @fdiv_f32(float %a, float %b) {
-; CHECK: div.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: div.rn.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = fdiv float %a, %b
   ret float %ret
diff --git a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
index f9a0c512b99b3..e6636d706b49d 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
@@ -121,11 +121,11 @@ define void @test_atomics_scope_imm(ptr %fp, float %f,
 ; CHECK: atom.cta.add.u64{{.*}}, 2;
   %tmp2i = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0(ptr %llp, i64 2)
 
-; CHECK: atom.cta.add.f32{{.*}}, %f{{[0-9]+}};
+; CHECK: atom.cta.add.f32{{.*}}, %r{{[0-9]+}};
   %tmp3r = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0(ptr %fp, float %f)
 ; CHECK: atom.cta.add.f32{{.*}}, 0f40400000;
   %tmp3i = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0(ptr %fp, float 3.0)
-; CHECK: atom.cta.add.f64{{.*}}, %fd{{[0-9]+}};
+; CHECK: atom.cta.add.f64{{.*}}, %rd{{[0-9]+}};
   %tmp4r = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0(ptr %dfp, double %df)
 ; CHECK: atom.cta.add.f64{{.*}}, 0d4010000000000000;
   %tmp4i = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0(ptr %dfp, double 4.0)
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 6c5af3da5d9b2..c8dc34e9de2ca 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -351,14 +351,14 @@ declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
 define float @atomic_add_f32_generic(ptr %addr, float %val) {
 ; CHECK-LABEL: atomic_add_f32_generic(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_add_f32_generic_param_0];
-; CHECK-NEXT:    ld.param.b32 %f1, [atomic_add_f32_generic_param_1];
-; CHECK-NEXT:    atom.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [atomic_add_f32_generic_param_1];
+; CHECK-NEXT:    atom.add.f32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
   ret float %ret
@@ -370,14 +370,14 @@ declare float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %v
 define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK-LABEL: atomic_add_f32_addrspace1(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_add_f32_addrspace1_param_0];
-; CHECK-NEXT:    ld.param.b32 %f1, [atomic_add_f32_addrspace1_param_1];
-; CHECK-NEXT:    atom.global.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [atomic_add_f32_addrspace1_param_1];
+; CHECK-NEXT:    atom.global.add.f32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val)
   ret float %ret
@@ -389,14 +389,14 @@ declare float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %v
 define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 ; CHECK-LABEL: atomic_add_f32_addrspace3(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_add_f32_addrspace3_param_0];
-; CHECK-NEXT:    ld.param.b32 %f1, [atomic_add_f32_addrspace3_param_1];
-; CHECK-NEXT:    atom.shared.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [atomic_add_f32_addrspace3_param_1];
+; CHECK-NEXT:    atom.shared.add.f32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val)
   ret float %ret
@@ -406,14 +406,14 @@ define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
 ; CHECK-LABEL: atomicrmw_add_f32_generic(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [atomicrmw_add_f32_generic_param_0];
-; CHECK-NEXT:    ld.param.b32 %f1, [atomicrmw_add_f32_generic_param_1];
-; CHECK-NEXT:    atom.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [atomicrmw_add_f32_generic_param_1];
+; CHECK-NEXT:    atom.add.f32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr %addr, float %val seq_cst
   ret float %ret
@@ -425,8 +425,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<17>;
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<20>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -439,27 +438,27 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
 ; CHECK-NEXT:    mov.b32 %r8, 65535;
 ; CHECK-NEXT:    shl.b32 %r9, %r8, %r1;
 ; CHECK-NEXT:    not.b32 %r2, %r9;
-; CHECK-NEXT:    ld.b32 %r16, [%rd1];
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT:    ld.b32 %r19, [%rd1];
+; CHECK-NEXT:    cvt.f32.f16 %r12, %rs1;
 ; CHECK-NEXT:  $L__BB24_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u32 %r10, %r16, %r1;
+; CHECK-NEXT:    shr.u32 %r10, %r19, %r1;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r10;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
-; CHECK-NEXT:    cvt.u32.u16 %r11, %rs3;
-; CHECK-NEXT:    shl.b32 %r12, %r11, %r1;
-; CHECK-NEXT:    and.b32 %r13, %r16, %r2;
-; CHECK-NEXT:    or.b32 %r14, %r13, %r12;
+; CHECK-NEXT:    cvt.f32.f16 %r11, %rs2;
+; CHECK-NEXT:    add.rn.f32 %r13, %r11, %r12;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r13;
+; CHECK-NEXT:    cvt.u32.u16 %r14, %rs3;
+; CHECK-NEXT:    shl.b32 %r15, %r14, %r1;
+; CHECK-NEXT:    and.b32 %r16, %r19, %r2;
+; CHECK-NEXT:    or.b32 %r17, %r16, %r15;
 ; CHECK-NEXT:    membar.sys;
-; CHECK-NEXT:    atom.cas.b32 %r5, [%rd1], %r16, %r14;
-; CHECK-NEXT:    setp.ne.s32 %p1, %r5, %r16;
-; CHECK-NEXT:    mov.b32 %r16, %r5;
+; CHECK-NEXT:    atom.cas.b32 %r5, [%rd1], %r19, %r17;
+; CHECK-NEXT:    setp.ne.s32 %p1, %r5, %r19;
+; CHECK-NEXT:    mov.b32 %r19, %r5;
 ; CHECK-NEXT:    @%p1 bra $L__BB24_1;
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    shr.u32 %r15, %r5, %r1;
-; CHECK-NEXT:    cvt.u16.u32 %rs4, %r15;
+; CHECK-NEXT:    shr.u32 %r18, %r5, %r1;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r18;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr %addr, half %val seq_cst
@@ -470,14 +469,14 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK-LABEL: atomicrmw_add_f32_addrspace1(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
-; CHECK-NEXT:    ld.param.b32 %f1, [atomicrmw_add_f32_addrspace1_param_1];
-; CHECK-NEXT:    atom.global.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [atomicrmw_add_f32_addrspace1_param_1];
+; CHECK-NEXT:    atom.global.add.f32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst
   ret float %ret
@@ -487,14 +486,14 @@ define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 ; CHECK-LABEL: atomicrmw_add_f32_addrspace3(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
-; CHECK-NEXT:    ld.param.b32 %f1, [atomicrmw_add_f32_addrspace3_param_1];
-; CHECK-NEXT:    atom.shared.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [atomicrmw_add_f32_addrspace3_param_1];
+; CHECK-NEXT:    atom.shared.add.f32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst
   ret float %ret
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 1ed191fcb9ff5..2854ea4b79302 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -18,25 +18,21 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<11>;
-; SM70-NEXT:    .reg .b32 %f<4>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_fadd_param_1];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    ld.param.b16 %r3, [test_fadd_param_0];
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f2, %r4;
-; SM70-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r5, %f3;
-; SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
-; SM70-NEXT:    add.s32 %r7, %r6, %r5;
-; SM70-NEXT:    add.s32 %r8, %r7, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r9, %r5, 4194304;
-; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r10;
+; SM70-NEXT:    shl.b32 %r14, %r1, 16;
+; SM70-NEXT:    ld.param.b16 %r4, [test_fadd_param_0];
+; SM70-NEXT:    shl.b32 %r15, %r4, 16;
+; SM70-NEXT:    add.rn.f32 %r16, %r15, %r14;
+; SM70-NEXT:    bfe.u32 %r9, %r16, 16, 1;
+; SM70-NEXT:    add.s32 %r10, %r9, %r16;
+; SM70-NEXT:    add.s32 %r11, %r10, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r16, %r16;
+; SM70-NEXT:    or.b32 %r12, %r16, 4194304;
+; SM70-NEXT:    selp.b32 %r13, %r12, %r11, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r13;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
@@ -55,15 +51,15 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
 ; SM80-FTZ-LABEL: test_fadd(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<4>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fadd_param_0];
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs2, [test_fadd_param_1];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs1;
-; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f3, %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT:    add.rn.ftz.f32 %r3, %r2, %r1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r3;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -86,25 +82,21 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<11>;
-; SM70-NEXT:    .reg .b32 %f<4>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_fsub_param_1];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    ld.param.b16 %r3, [test_fsub_param_0];
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f2, %r4;
-; SM70-NEXT:    sub.rn.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r5, %f3;
-; SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
-; SM70-NEXT:    add.s32 %r7, %r6, %r5;
-; SM70-NEXT:    add.s32 %r8, %r7, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r9, %r5, 4194304;
-; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r10;
+; SM70-NEXT:    shl.b32 %r14, %r1, 16;
+; SM70-NEXT:    ld.param.b16 %r4, [test_fsub_param_0];
+; SM70-NEXT:    shl.b32 %r15, %r4, 16;
+; SM70-NEXT:    sub.rn.f32 %r16, %r15, %r14;
+; SM70-NEXT:    bfe.u32 %r9, %r16, 16, 1;
+; SM70-NEXT:    add.s32 %r10, %r9, %r16;
+; SM70-NEXT:    add.s32 %r11, %r10, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r16, %r16;
+; SM70-NEXT:    or.b32 %r12, %r16, 4194304;
+; SM70-NEXT:    selp.b32 %r13, %r12, %r11, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r13;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
@@ -123,15 +115,15 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
 ; SM80-FTZ-LABEL: test_fsub(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<4>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fsub_param_0];
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs2, [test_fsub_param_1];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs1;
-; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %f3, %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %r3, %r2, %r1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r3;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -154,44 +146,37 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<5>;
-; SM70-NEXT:    .reg .b32 %r<24>;
-; SM70-NEXT:    .reg .b32 %f<7>;
+; SM70-NEXT:    .reg .b32 %r<36>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_faddx2_param_0];
 ; SM70-NEXT:    ld.param.b32 %r2, [test_faddx2_param_1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f1, %r4;
+; SM70-NEXT:    shl.b32 %r30, %r3, 16;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f2, %r6;
-; SM70-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r7, %f3;
-; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f4, %r14;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    mov.b32 %f5, %r16;
-; SM70-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; SM70-NEXT:    mov.b32 %r17, %f6;
-; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
-; SM70-NEXT:    add.s32 %r19, %r18, %r17;
-; SM70-NEXT:    add.s32 %r20, %r19, 32767;
-; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
-; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
+; SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
+; SM70-NEXT:    shl.b32 %r31, %r6, 16;
+; SM70-NEXT:    add.rn.f32 %r32, %r31, %r30;
+; SM70-NEXT:    bfe.u32 %r11, %r32, 16, 1;
+; SM70-NEXT:    add.s32 %r12, %r11, %r32;
+; SM70-NEXT:    add.s32 %r13, %r12, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r32, %r32;
+; SM70-NEXT:    or.b32 %r14, %r32, 4194304;
+; SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
+; SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
+; SM70-NEXT:    shl.b32 %r33, %r16, 16;
+; SM70-NEXT:    cvt.u32.u16 %r19, %rs3;
+; SM70-NEXT:    shl.b32 %r34, %r19, 16;
+; SM70-NEXT:    add.rn.f32 %r35, %r34, %r33;
+; SM70-NEXT:    bfe.u32 %r24, %r35, 16, 1;
+; SM70-NEXT:    add.s32 %r25, %r24, %r35;
+; SM70-NEXT:    add.s32 %r26, %r25, 32767;
+; SM70-NEXT:    setp.nan.f32 %p2, %r35, %r35;
+; SM70-NEXT:    or.b32 %r27, %r35, 4194304;
+; SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p2;
+; SM70-NEXT:    prmt.b32 %r29, %r28, %r15, 0x7632U;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r29;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_faddx2(
@@ -209,22 +194,21 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM80-FTZ-LABEL: test_faddx2(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<7>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<10>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_faddx2_param_0];
 ; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_faddx2_param_1];
 ; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r3, %rs1;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
-; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f3, %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
-; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f5, %f4;
-; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r4, %rs3;
+; SM80-FTZ-NEXT:    add.rn.ftz.f32 %r5, %r4, %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs4;
+; SM80-FTZ-NEXT:    add.rn.ftz.f32 %r8, %r7, %r6;
+; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_faddx2(
@@ -246,44 +230,37 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<5>;
-; SM70-NEXT:    .reg .b32 %r<24>;
-; SM70-NEXT:    .reg .b32 %f<7>;
+; SM70-NEXT:    .reg .b32 %r<36>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
 ; SM70-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f1, %r4;
+; SM70-NEXT:    shl.b32 %r30, %r3, 16;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f2, %r6;
-; SM70-NEXT:    sub.rn.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r7, %f3;
-; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f4, %r14;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    mov.b32 %f5, %r16;
-; SM70-NEXT:    sub.rn.f32 %f6, %f5, %f4;
-; SM70-NEXT:    mov.b32 %r17, %f6;
-; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
-; SM70-NEXT:    add.s32 %r19, %r18, %r17;
-; SM70-NEXT:    add.s32 %r20, %r19, 32767;
-; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
-; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
+; SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
+; SM70-NEXT:    shl.b32 %r31, %r6, 16;
+; SM70-NEXT:    sub.rn.f32 %r32, %r31, %r30;
+; SM70-NEXT:    bfe.u32 %r11, %r32, 16, 1;
+; SM70-NEXT:    add.s32 %r12, %r11, %r32;
+; SM70-NEXT:    add.s32 %r13, %r12, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r32, %r32;
+; SM70-NEXT:    or.b32 %r14, %r32, 4194304;
+; SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
+; SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
+; SM70-NEXT:    shl.b32 %r33, %r16, 16;
+; SM70-NEXT:    cvt.u32.u16 %r19, %rs3;
+; SM70-NEXT:    shl.b32 %r34, %r19, 16;
+; SM70-NEXT:    sub.rn.f32 %r35, %r34, %r33;
+; SM70-NEXT:    bfe.u32 %r24, %r35, 16, 1;
+; SM70-NEXT:    add.s32 %r25, %r24, %r35;
+; SM70-NEXT:    add.s32 %r26, %r25, 32767;
+; SM70-NEXT:    setp.nan.f32 %p2, %r35, %r35;
+; SM70-NEXT:    or.b32 %r27, %r35, 4194304;
+; SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p2;
+; SM70-NEXT:    prmt.b32 %r29, %r28, %r15, 0x7632U;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r29;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fsubx2(
@@ -301,22 +278,21 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM80-FTZ-LABEL: test_fsubx2(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<7>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<10>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
 ; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
 ; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r3, %rs1;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
-; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %f3, %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
-; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %f6, %f5, %f4;
-; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r4, %rs3;
+; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %r5, %r4, %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs4;
+; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %r8, %r7, %r6;
+; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fsubx2(
@@ -338,44 +314,37 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<5>;
-; SM70-NEXT:    .reg .b32 %r<24>;
-; SM70-NEXT:    .reg .b32 %f<7>;
+; SM70-NEXT:    .reg .b32 %r<36>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_0];
 ; SM70-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f1, %r4;
+; SM70-NEXT:    shl.b32 %r30, %r3, 16;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f2, %r6;
-; SM70-NEXT:    mul.rn.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r7, %f3;
-; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f4, %r14;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    mov.b32 %f5, %r16;
-; SM70-NEXT:    mul.rn.f32 %f6, %f5, %f4;
-; SM70-NEXT:    mov.b32 %r17, %f6;
-; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
-; SM70-NEXT:    add.s32 %r19, %r18, %r17;
-; SM70-NEXT:    add.s32 %r20, %r19, 32767;
-; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
-; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
+; SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
+; SM70-NEXT:    shl.b32 %r31, %r6, 16;
+; SM70-NEXT:    mul.rn.f32 %r32, %r31, %r30;
+; SM70-NEXT:    bfe.u32 %r11, %r32, 16, 1;
+; SM70-NEXT:    add.s32 %r12, %r11, %r32;
+; SM70-NEXT:    add.s32 %r13, %r12, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r32, %r32;
+; SM70-NEXT:    or.b32 %r14, %r32, 4194304;
+; SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
+; SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
+; SM70-NEXT:    shl.b32 %r33, %r16, 16;
+; SM70-NEXT:    cvt.u32.u16 %r19, %rs3;
+; SM70-NEXT:    shl.b32 %r34, %r19, 16;
+; SM70-NEXT:    mul.rn.f32 %r35, %r34, %r33;
+; SM70-NEXT:    bfe.u32 %r24, %r35, 16, 1;
+; SM70-NEXT:    add.s32 %r25, %r24, %r35;
+; SM70-NEXT:    add.s32 %r26, %r25, 32767;
+; SM70-NEXT:    setp.nan.f32 %p2, %r35, %r35;
+; SM70-NEXT:    or.b32 %r27, %r35, 4194304;
+; SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p2;
+; SM70-NEXT:    prmt.b32 %r29, %r28, %r15, 0x7632U;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r29;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fmulx2(
@@ -393,22 +362,21 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM80-FTZ-LABEL: test_fmulx2(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<7>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<10>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_0];
 ; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_1];
 ; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r3, %rs1;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
-; SM80-FTZ-NEXT:    mul.rn.ftz.f32 %f3, %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
-; SM80-FTZ-NEXT:    mul.rn.ftz.f32 %f6, %f5, %f4;
-; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r4, %rs3;
+; SM80-FTZ-NEXT:    mul.rn.ftz.f32 %r5, %r4, %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs4;
+; SM80-FTZ-NEXT:    mul.rn.ftz.f32 %r8, %r7, %r6;
+; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fmulx2(
@@ -430,107 +398,97 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<5>;
-; SM70-NEXT:    .reg .b32 %r<24>;
-; SM70-NEXT:    .reg .b32 %f<7>;
+; SM70-NEXT:    .reg .b32 %r<36>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
 ; SM70-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f1, %r4;
+; SM70-NEXT:    shl.b32 %r30, %r3, 16;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f2, %r6;
-; SM70-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r7, %f3;
-; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f4, %r14;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    mov.b32 %f5, %r16;
-; SM70-NEXT:    div.rn.f32 %f6, %f5, %f4;
-; SM70-NEXT:    mov.b32 %r17, %f6;
-; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
-; SM70-NEXT:    add.s32 %r19, %r18, %r17;
-; SM70-NEXT:    add.s32 %r20, %r19, 32767;
-; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
-; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
+; SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
+; SM70-NEXT:    shl.b32 %r31, %r6, 16;
+; SM70-NEXT:    div.rn.f32 %r32, %r31, %r30;
+; SM70-NEXT:    bfe.u32 %r11, %r32, 16, 1;
+; SM70-NEXT:    add.s32 %r12, %r11, %r32;
+; SM70-NEXT:    add.s32 %r13, %r12, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r32, %r32;
+; SM70-NEXT:    or.b32 %r14, %r32, 4194304;
+; SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
+; SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
+; SM70-NEXT:    shl.b32 %r33, %r16, 16;
+; SM70-NEXT:    cvt.u32.u16 %r19, %rs3;
+; SM70-NEXT:    shl.b32 %r34, %r19, 16;
+; SM70-NEXT:    div.rn.f32 %r35, %r34, %r33;
+; SM70-NEXT:    bfe.u32 %r24, %r35, 16, 1;
+; SM70-NEXT:    add.s32 %r25, %r24, %r35;
+; SM70-NEXT:    add.s32 %r26, %r25, 32767;
+; SM70-NEXT:    setp.nan.f32 %p2, %r35, %r35;
+; SM70-NEXT:    or.b32 %r27, %r35, 4194304;
+; SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p2;
+; SM70-NEXT:    prmt.b32 %r29, %r28, %r15, 0x7632U;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r29;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fdiv(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<5>;
-; SM80-NEXT:    .reg .b32 %r<4>;
-; SM80-NEXT:    .reg .b32 %f<7>;
+; SM80-NEXT:    .reg .b32 %r<10>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
 ; SM80-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
+; SM80-NEXT:    cvt.f32.bf16 %r3, %rs1;
 ; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT:    cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT:    cvt.f32.bf16 %f4, %rs2;
-; SM80-NEXT:    cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT:    div.rn.f32 %f6, %f5, %f4;
-; SM80-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-NEXT:    cvt.f32.bf16 %r4, %rs3;
+; SM80-NEXT:    div.rn.f32 %r5, %r4, %r3;
+; SM80-NEXT:    cvt.f32.bf16 %r6, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %r7, %rs4;
+; SM80-NEXT:    div.rn.f32 %r8, %r7, %r6;
+; SM80-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_fdiv(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<7>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<10>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
 ; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
 ; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r3, %rs1;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
-; SM80-FTZ-NEXT:    div.rn.ftz.f32 %f3, %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
-; SM80-FTZ-NEXT:    div.rn.ftz.f32 %f6, %f5, %f4;
-; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r4, %rs3;
+; SM80-FTZ-NEXT:    div.rn.ftz.f32 %r5, %r4, %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs4;
+; SM80-FTZ-NEXT:    div.rn.ftz.f32 %r8, %r7, %r6;
+; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fdiv(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b16 %rs<5>;
-; SM90-NEXT:    .reg .b32 %r<4>;
-; SM90-NEXT:    .reg .b32 %f<7>;
+; SM90-NEXT:    .reg .b32 %r<10>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
 ; SM90-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
 ; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM90-NEXT:    cvt.f32.bf16 %f1, %rs1;
+; SM90-NEXT:    cvt.f32.bf16 %r3, %rs1;
 ; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM90-NEXT:    cvt.f32.bf16 %f2, %rs3;
-; SM90-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; SM90-NEXT:    cvt.f32.bf16 %f4, %rs2;
-; SM90-NEXT:    cvt.f32.bf16 %f5, %rs4;
-; SM90-NEXT:    div.rn.f32 %f6, %f5, %f4;
-; SM90-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    cvt.f32.bf16 %r4, %rs3;
+; SM90-NEXT:    div.rn.f32 %r5, %r4, %r3;
+; SM90-NEXT:    cvt.f32.bf16 %r6, %rs2;
+; SM90-NEXT:    cvt.f32.bf16 %r7, %rs4;
+; SM90-NEXT:    div.rn.f32 %r8, %r7, %r6;
+; SM90-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM90-NEXT:    ret;
   %r = fdiv <2 x bfloat> %a, %b
   ret <2 x bfloat> %r
@@ -565,47 +523,45 @@ define bfloat @test_extract_1(<2 x bfloat> %a) #0 {
 define float @test_fpext_float(bfloat %a) #0 {
 ; SM70-LABEL: test_fpext_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<3>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_fpext_float_param_0];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %f1;
+; SM70-NEXT:    shl.b32 %r4, %r1, 16;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r4;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fpext_float(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    st.param.b32 [func_retval0], %f1;
+; SM80-NEXT:    cvt.f32.bf16 %r1, %rs1;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r1;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_fpext_float(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %f1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r1;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fpext_float(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %f<2>;
+; SM90-NEXT:    .reg .b32 %r<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM90-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM90-NEXT:    st.param.b32 [func_retval0], %f1;
+; SM90-NEXT:    cvt.f32.bf16 %r1, %rs1;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r1;
 ; SM90-NEXT:    ret;
   %r = fpext bfloat %a to float
   ret float %r
@@ -616,52 +572,50 @@ define bfloat @test_fptrunc_float(float %a) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<7>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<9>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
-; SM70-NEXT:    mov.b32 %r1, %f1;
-; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
-; SM70-NEXT:    add.s32 %r3, %r2, %r1;
-; SM70-NEXT:    add.s32 %r4, %r3, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
-; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r6;
+; SM70-NEXT:    ld.param.b32 %r8, [test_fptrunc_float_param_0];
+; SM70-NEXT:    bfe.u32 %r3, %r8, 16, 1;
+; SM70-NEXT:    add.s32 %r4, %r3, %r8;
+; SM70-NEXT:    add.s32 %r5, %r4, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r8;
+; SM70-NEXT:    or.b32 %r6, %r8, 4194304;
+; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r7;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fptrunc_float(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM80-NEXT:    ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %r1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_fptrunc_float(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %r1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fptrunc_float(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b16 %rs<2>;
-; SM90-NEXT:    .reg .b32 %f<2>;
+; SM90-NEXT:    .reg .b32 %r<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
-; SM90-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM90-NEXT:    ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM90-NEXT:    cvt.rn.bf16.f32 %rs1, %r1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM90-NEXT:    ret;
   %r = fptrunc float %a to bfloat
@@ -673,22 +627,19 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<9>;
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<13>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_fadd_imm_1_param_0];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    mov.b32 %r3, %f2;
-; SM70-NEXT:    bfe.u32 %r4, %r3, 16, 1;
-; SM70-NEXT:    add.s32 %r5, %r4, %r3;
-; SM70-NEXT:    add.s32 %r6, %r5, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; SM70-NEXT:    or.b32 %r7, %r3, 4194304;
-; SM70-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r8;
+; SM70-NEXT:    shl.b32 %r11, %r1, 16;
+; SM70-NEXT:    add.rn.f32 %r12, %r11, 0f3F800000;
+; SM70-NEXT:    bfe.u32 %r6, %r12, 16, 1;
+; SM70-NEXT:    add.s32 %r7, %r6, %r12;
+; SM70-NEXT:    add.s32 %r8, %r7, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; SM70-NEXT:    or.b32 %r9, %r12, 4194304;
+; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r10;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
@@ -706,13 +657,13 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
 ; SM80-FTZ-LABEL: test_fadd_imm_1(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<3>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<3>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
-; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f2, %f1, 0f3F800000;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT:    add.rn.ftz.f32 %r2, %r1, 0f3F800000;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %r2;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -735,12 +686,12 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [test_select_cc_bf16_f64_param_0];
-; CHECK-NEXT:    ld.param.b64 %fd2, [test_select_cc_bf16_f64_param_1];
-; CHECK-NEXT:    setp.lt.f64 %p1, %fd1, %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_cc_bf16_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_select_cc_bf16_f64_param_1];
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd1, %rd2;
 ; CHECK-NEXT:    ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
 ; CHECK-NEXT:    ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
 ; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
@@ -755,8 +706,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM70-LABEL: test_extload_bf16x8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b16 %rs<9>;
-; SM70-NEXT:    .reg .b32 %r<21>;
-; SM70-NEXT:    .reg .b32 %f<9>;
+; SM70-NEXT:    .reg .b32 %r<37>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
@@ -767,38 +717,29 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
 ; SM70-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
 ; SM70-NEXT:    cvt.u32.u16 %r5, %rs8;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f1, %r6;
-; SM70-NEXT:    cvt.u32.u16 %r7, %rs7;
-; SM70-NEXT:    shl.b32 %r8, %r7, 16;
-; SM70-NEXT:    mov.b32 %f2, %r8;
-; SM70-NEXT:    cvt.u32.u16 %r9, %rs6;
-; SM70-NEXT:    shl.b32 %r10, %r9, 16;
-; SM70-NEXT:    mov.b32 %f3, %r10;
-; SM70-NEXT:    cvt.u32.u16 %r11, %rs5;
-; SM70-NEXT:    shl.b32 %r12, %r11, 16;
-; SM70-NEXT:    mov.b32 %f4, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs4;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f5, %r14;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    mov.b32 %f6, %r16;
-; SM70-NEXT:    cvt.u32.u16 %r17, %rs2;
-; SM70-NEXT:    shl.b32 %r18, %r17, 16;
-; SM70-NEXT:    mov.b32 %f7, %r18;
-; SM70-NEXT:    cvt.u32.u16 %r19, %rs1;
-; SM70-NEXT:    shl.b32 %r20, %r19, 16;
-; SM70-NEXT:    mov.b32 %f8, %r20;
-; SM70-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM70-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM70-NEXT:    shl.b32 %r29, %r5, 16;
+; SM70-NEXT:    cvt.u32.u16 %r8, %rs7;
+; SM70-NEXT:    shl.b32 %r30, %r8, 16;
+; SM70-NEXT:    cvt.u32.u16 %r11, %rs6;
+; SM70-NEXT:    shl.b32 %r31, %r11, 16;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs5;
+; SM70-NEXT:    shl.b32 %r32, %r14, 16;
+; SM70-NEXT:    cvt.u32.u16 %r17, %rs4;
+; SM70-NEXT:    shl.b32 %r33, %r17, 16;
+; SM70-NEXT:    cvt.u32.u16 %r20, %rs3;
+; SM70-NEXT:    shl.b32 %r34, %r20, 16;
+; SM70-NEXT:    cvt.u32.u16 %r23, %rs2;
+; SM70-NEXT:    shl.b32 %r35, %r23, 16;
+; SM70-NEXT:    cvt.u32.u16 %r26, %rs1;
+; SM70-NEXT:    shl.b32 %r36, %r26, 16;
+; SM70-NEXT:    st.param.v4.b32 [func_retval0], {%r36, %r35, %r34, %r33};
+; SM70-NEXT:    st.param.v4.b32 [func_retval0+16], {%r32, %r31, %r30, %r29};
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_extload_bf16x8(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<9>;
-; SM80-NEXT:    .reg .b32 %r<5>;
-; SM80-NEXT:    .reg .b32 %f<9>;
+; SM80-NEXT:    .reg .b32 %r<13>;
 ; SM80-NEXT:    .reg .b64 %rd<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
@@ -808,23 +749,22 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
 ; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs8;
-; SM80-NEXT:    cvt.f32.bf16 %f2, %rs7;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs6;
-; SM80-NEXT:    cvt.f32.bf16 %f4, %rs5;
-; SM80-NEXT:    cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT:    cvt.f32.bf16 %f6, %rs3;
-; SM80-NEXT:    cvt.f32.bf16 %f7, %rs2;
-; SM80-NEXT:    cvt.f32.bf16 %f8, %rs1;
-; SM80-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM80-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM80-NEXT:    cvt.f32.bf16 %r5, %rs8;
+; SM80-NEXT:    cvt.f32.bf16 %r6, %rs7;
+; SM80-NEXT:    cvt.f32.bf16 %r7, %rs6;
+; SM80-NEXT:    cvt.f32.bf16 %r8, %rs5;
+; SM80-NEXT:    cvt.f32.bf16 %r9, %rs4;
+; SM80-NEXT:    cvt.f32.bf16 %r10, %rs3;
+; SM80-NEXT:    cvt.f32.bf16 %r11, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %r12, %rs1;
+; SM80-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
+; SM80-NEXT:    st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_extload_bf16x8(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<9>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<5>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<9>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<13>;
 ; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
@@ -834,23 +774,22 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs8;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs7;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f3, %rs6;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs5;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f6, %rs3;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f7, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f8, %rs1;
-; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r5, %rs8;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs7;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs6;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r8, %rs5;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r9, %rs4;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r10, %rs3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r11, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r12, %rs1;
+; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
+; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_extload_bf16x8(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b16 %rs<9>;
-; SM90-NEXT:    .reg .b32 %r<5>;
-; SM90-NEXT:    .reg .b32 %f<9>;
+; SM90-NEXT:    .reg .b32 %r<13>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
@@ -860,16 +799,16 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; SM90-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
 ; SM90-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM90-NEXT:    cvt.f32.bf16 %f1, %rs8;
-; SM90-NEXT:    cvt.f32.bf16 %f2, %rs7;
-; SM90-NEXT:    cvt.f32.bf16 %f3, %rs6;
-; SM90-NEXT:    cvt.f32.bf16 %f4, %rs5;
-; SM90-NEXT:    cvt.f32.bf16 %f5, %rs4;
-; SM90-NEXT:    cvt.f32.bf16 %f6, %rs3;
-; SM90-NEXT:    cvt.f32.bf16 %f7, %rs2;
-; SM90-NEXT:    cvt.f32.bf16 %f8, %rs1;
-; SM90-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM90-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM90-NEXT:    cvt.f32.bf16 %r5, %rs8;
+; SM90-NEXT:    cvt.f32.bf16 %r6, %rs7;
+; SM90-NEXT:    cvt.f32.bf16 %r7, %rs6;
+; SM90-NEXT:    cvt.f32.bf16 %r8, %rs5;
+; SM90-NEXT:    cvt.f32.bf16 %r9, %rs4;
+; SM90-NEXT:    cvt.f32.bf16 %r10, %rs3;
+; SM90-NEXT:    cvt.f32.bf16 %r11, %rs2;
+; SM90-NEXT:    cvt.f32.bf16 %r12, %rs1;
+; SM90-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
+; SM90-NEXT:    st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
 ; SM90-NEXT:    ret;
   %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
   %res = fpext <8 x bfloat> %load to <8 x float>
@@ -880,44 +819,40 @@ define i16 @test_fptosi_i16(bfloat %a) {
 ; SM70-LABEL: test_fptosi_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<6>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_fptosi_i16_param_0];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    cvt.rzi.s16.f32 %rs1, %f1;
-; SM70-NEXT:    cvt.u32.u16 %r3, %rs1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    shl.b32 %r5, %r1, 16;
+; SM70-NEXT:    cvt.rzi.s16.f32 %rs1, %r5;
+; SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r4;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fptosi_i16(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %r<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<3>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_fptosi_i16_param_0];
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rzi.s16.f32 %rs2, %f1;
-; SM80-NEXT:    cvt.u32.u16 %r1, %rs2;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM80-NEXT:    cvt.f32.bf16 %r1, %rs1;
+; SM80-NEXT:    cvt.rzi.s16.f32 %rs2, %r1;
+; SM80-NEXT:    cvt.u32.u16 %r2, %rs2;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_fptosi_i16(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<3>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fptosi_i16_param_0];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
-; SM80-FTZ-NEXT:    cvt.rzi.ftz.s16.f32 %rs2, %f1;
-; SM80-FTZ-NEXT:    cvt.u32.u16 %r1, %rs2;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT:    cvt.rzi.ftz.s16.f32 %rs2, %r1;
+; SM80-FTZ-NEXT:    cvt.u32.u16 %r2, %rs2;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fptosi_i16(
@@ -939,44 +874,40 @@ define i16 @test_fptoui_i16(bfloat %a) {
 ; SM70-LABEL: test_fptoui_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<6>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_fptoui_i16_param_0];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    cvt.rzi.u16.f32 %rs1, %f1;
-; SM70-NEXT:    cvt.u32.u16 %r3, %rs1;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM70-NEXT:    shl.b32 %r5, %r1, 16;
+; SM70-NEXT:    cvt.rzi.u16.f32 %rs1, %r5;
+; SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r4;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fptoui_i16(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %r<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<3>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_fptoui_i16_param_0];
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rzi.u16.f32 %rs2, %f1;
-; SM80-NEXT:    cvt.u32.u16 %r1, %rs2;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM80-NEXT:    cvt.f32.bf16 %r1, %rs1;
+; SM80-NEXT:    cvt.rzi.u16.f32 %rs2, %r1;
+; SM80-NEXT:    cvt.u32.u16 %r2, %rs2;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_fptoui_i16(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<3>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fptoui_i16_param_0];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
-; SM80-FTZ-NEXT:    cvt.rzi.ftz.u16.f32 %rs2, %f1;
-; SM80-FTZ-NEXT:    cvt.u32.u16 %r1, %rs2;
-; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT:    cvt.rzi.ftz.u16.f32 %rs2, %r1;
+; SM80-FTZ-NEXT:    cvt.u32.u16 %r2, %rs2;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fptoui_i16(
@@ -999,44 +930,42 @@ define bfloat @test_sitofp_i16(i16 %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<3>;
-; SM70-NEXT:    .reg .b32 %r<7>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<9>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
-; SM70-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
-; SM70-NEXT:    mov.b32 %r1, %f1;
-; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
-; SM70-NEXT:    add.s32 %r3, %r2, %r1;
-; SM70-NEXT:    add.s32 %r4, %r3, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
-; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs2}, %r6;
+; SM70-NEXT:    cvt.rn.f32.s16 %r8, %rs1;
+; SM70-NEXT:    bfe.u32 %r3, %r8, 16, 1;
+; SM70-NEXT:    add.s32 %r4, %r3, %r8;
+; SM70-NEXT:    add.s32 %r5, %r4, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r8;
+; SM70-NEXT:    or.b32 %r6, %r8, 4194304;
+; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs2}, %r7;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_sitofp_i16(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
-; SM80-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; SM80-NEXT:    cvt.rn.f32.s16 %r1, %rs1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %r1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_sitofp_i16(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
-; SM80-FTZ-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; SM80-FTZ-NEXT:    cvt.rn.f32.s16 %r1, %rs1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %r1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1058,44 +987,42 @@ define bfloat @test_uitofp_i8(i8 %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<3>;
-; SM70-NEXT:    .reg .b32 %r<7>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<9>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
-; SM70-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; SM70-NEXT:    mov.b32 %r1, %f1;
-; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
-; SM70-NEXT:    add.s32 %r3, %r2, %r1;
-; SM70-NEXT:    add.s32 %r4, %r3, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
-; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs2}, %r6;
+; SM70-NEXT:    cvt.rn.f32.u16 %r8, %rs1;
+; SM70-NEXT:    bfe.u32 %r3, %r8, 16, 1;
+; SM70-NEXT:    add.s32 %r4, %r3, %r8;
+; SM70-NEXT:    add.s32 %r5, %r4, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r8;
+; SM70-NEXT:    or.b32 %r6, %r8, 4194304;
+; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs2}, %r7;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_uitofp_i8(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
-; SM80-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; SM80-NEXT:    cvt.rn.f32.u16 %r1, %rs1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %r1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_uitofp_i8(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
-; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %r1, %rs1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %r1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1117,23 +1044,21 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<4>;
-; SM70-NEXT:    .reg .b32 %r<8>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<10>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM70-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM70-NEXT:    selp.b32 %r1, 1, 0, %p1;
-; SM70-NEXT:    cvt.rn.f32.u32 %f1, %r1;
-; SM70-NEXT:    mov.b32 %r2, %f1;
-; SM70-NEXT:    bfe.u32 %r3, %r2, 16, 1;
-; SM70-NEXT:    add.s32 %r4, %r3, %r2;
-; SM70-NEXT:    add.s32 %r5, %r4, 32767;
-; SM70-NEXT:    setp.nan.f32 %p2, %f1, %f1;
-; SM70-NEXT:    or.b32 %r6, %r2, 4194304;
-; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p2;
-; SM70-NEXT:    mov.b32 {_, %rs3}, %r7;
+; SM70-NEXT:    cvt.rn.f32.u32 %r9, %r1;
+; SM70-NEXT:    bfe.u32 %r4, %r9, 16, 1;
+; SM70-NEXT:    add.s32 %r5, %r4, %r9;
+; SM70-NEXT:    add.s32 %r6, %r5, 32767;
+; SM70-NEXT:    setp.nan.f32 %p2, %r9, %r9;
+; SM70-NEXT:    or.b32 %r7, %r9, 4194304;
+; SM70-NEXT:    selp.b32 %r8, %r7, %r6, %p2;
+; SM70-NEXT:    mov.b32 {_, %rs3}, %r8;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; SM70-NEXT:    ret;
 ;
@@ -1141,16 +1066,15 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM80:       {
 ; SM80-NEXT:    .reg .pred %p<2>;
 ; SM80-NEXT:    .reg .b16 %rs<4>;
-; SM80-NEXT:    .reg .b32 %r<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<3>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM80-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM80-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM80-NEXT:    selp.b32 %r1, 1, 0, %p1;
-; SM80-NEXT:    cvt.rn.f32.u32 %f1, %r1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f1;
+; SM80-NEXT:    cvt.rn.f32.u32 %r2, %r1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %r2;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; SM80-NEXT:    ret;
 ;
@@ -1158,16 +1082,15 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .pred %p<2>;
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<3>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM80-FTZ-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM80-FTZ-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM80-FTZ-NEXT:    selp.b32 %r1, 1, 0, %p1;
-; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %f1, %r1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f1;
+; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %r2, %r1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r2;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1194,44 +1117,42 @@ define bfloat @test_uitofp_i16(i16 %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<3>;
-; SM70-NEXT:    .reg .b32 %r<7>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<9>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
-; SM70-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; SM70-NEXT:    mov.b32 %r1, %f1;
-; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
-; SM70-NEXT:    add.s32 %r3, %r2, %r1;
-; SM70-NEXT:    add.s32 %r4, %r3, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
-; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs2}, %r6;
+; SM70-NEXT:    cvt.rn.f32.u16 %r8, %rs1;
+; SM70-NEXT:    bfe.u32 %r3, %r8, 16, 1;
+; SM70-NEXT:    add.s32 %r4, %r3, %r8;
+; SM70-NEXT:    add.s32 %r5, %r4, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r8;
+; SM70-NEXT:    or.b32 %r6, %r8, 4194304;
+; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs2}, %r7;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_uitofp_i16(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
-; SM80-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; SM80-NEXT:    cvt.rn.f32.u16 %r1, %rs1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %r1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_uitofp_i16(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
-; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %r1, %rs1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %r1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1253,46 +1174,42 @@ define bfloat @test_uitofp_i32(i32 %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<8>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<10>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
-; SM70-NEXT:    cvt.rn.f32.u32 %f1, %r1;
-; SM70-NEXT:    mov.b32 %r2, %f1;
-; SM70-NEXT:    bfe.u32 %r3, %r2, 16, 1;
-; SM70-NEXT:    add.s32 %r4, %r3, %r2;
-; SM70-NEXT:    add.s32 %r5, %r4, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; SM70-NEXT:    or.b32 %r6, %r2, 4194304;
-; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r7;
+; SM70-NEXT:    cvt.rn.f32.u32 %r9, %r1;
+; SM70-NEXT:    bfe.u32 %r4, %r9, 16, 1;
+; SM70-NEXT:    add.s32 %r5, %r4, %r9;
+; SM70-NEXT:    add.s32 %r6, %r5, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r9, %r9;
+; SM70-NEXT:    or.b32 %r7, %r9, 4194304;
+; SM70-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r8;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_uitofp_i32(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<2>;
-; SM80-NEXT:    .reg .b32 %r<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<3>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
-; SM80-NEXT:    cvt.rn.f32.u32 %f1, %r1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM80-NEXT:    cvt.rn.f32.u32 %r2, %r1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %r2;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_uitofp_i32(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<3>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
-; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %f1, %r1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %r2, %r1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %r2;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1315,47 +1232,45 @@ define bfloat @test_uitofp_i64(i64 %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<7>;
-; SM70-NEXT:    .reg .b32 %f<2>;
+; SM70-NEXT:    .reg .b32 %r<9>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
-; SM70-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
-; SM70-NEXT:    mov.b32 %r1, %f1;
-; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
-; SM70-NEXT:    add.s32 %r3, %r2, %r1;
-; SM70-NEXT:    add.s32 %r4, %r3, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
-; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r6;
+; SM70-NEXT:    cvt.rn.f32.u64 %r8, %rd1;
+; SM70-NEXT:    bfe.u32 %r3, %r8, 16, 1;
+; SM70-NEXT:    add.s32 %r4, %r3, %r8;
+; SM70-NEXT:    add.s32 %r5, %r4, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r8;
+; SM70-NEXT:    or.b32 %r6, %r8, 4194304;
+; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r7;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_uitofp_i64(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<2>;
-; SM80-NEXT:    .reg .b32 %f<2>;
+; SM80-NEXT:    .reg .b32 %r<2>;
 ; SM80-NEXT:    .reg .b64 %rd<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
-; SM80-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM80-NEXT:    cvt.rn.f32.u64 %r1, %rd1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %r1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_uitofp_i64(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
 ; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
-; SM80-FTZ-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; SM80-FTZ-NEXT:    cvt.rn.f32.u64 %r1, %rd1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %r1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1378,48 +1293,45 @@ define bfloat @test_roundeven(bfloat %a) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<9>;
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<13>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_roundeven_param_0];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; SM70-NEXT:    mov.b32 %r3, %f2;
-; SM70-NEXT:    bfe.u32 %r4, %r3, 16, 1;
-; SM70-NEXT:    add.s32 %r5, %r4, %r3;
-; SM70-NEXT:    add.s32 %r6, %r5, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; SM70-NEXT:    or.b32 %r7, %r3, 4194304;
-; SM70-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r8;
+; SM70-NEXT:    shl.b32 %r11, %r1, 16;
+; SM70-NEXT:    cvt.rni.f32.f32 %r12, %r11;
+; SM70-NEXT:    bfe.u32 %r6, %r12, 16, 1;
+; SM70-NEXT:    add.s32 %r7, %r6, %r12;
+; SM70-NEXT:    add.s32 %r8, %r7, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; SM70-NEXT:    or.b32 %r9, %r12, 4194304;
+; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r10;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_roundeven(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %f<3>;
+; SM80-NEXT:    .reg .b32 %r<3>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_roundeven_param_0];
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %r1, %rs1;
+; SM80-NEXT:    cvt.rni.f32.f32 %r2, %r1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %r2;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_roundeven(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
-; SM80-FTZ-NEXT:    .reg .b32 %f<3>;
+; SM80-FTZ-NEXT:    .reg .b32 %r<3>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_roundeven_param_0];
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
-; SM80-FTZ-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT:    cvt.rni.ftz.f32.f32 %r2, %r1;
+; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %r2;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM80-FTZ-NEXT:    ret;
 ;
@@ -1441,30 +1353,26 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<6>;
 ; SM70-NEXT:    .reg .b16 %rs<8>;
-; SM70-NEXT:    .reg .b32 %r<7>;
-; SM70-NEXT:    .reg .b32 %f<4>;
+; SM70-NEXT:    .reg .b32 %r<13>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %rs1, [test_maximum_param_0];
 ; SM70-NEXT:    ld.param.b16 %rs2, [test_maximum_param_1];
 ; SM70-NEXT:    cvt.u32.u16 %r1, %rs2;
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    cvt.u32.u16 %r3, %rs1;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f2, %r4;
-; SM70-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; SM70-NEXT:    shl.b32 %r10, %r1, 16;
+; SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r4, 16;
+; SM70-NEXT:    setp.gt.f32 %p1, %r11, %r10;
 ; SM70-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; SM70-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; SM70-NEXT:    setp.nan.f32 %p2, %r11, %r10;
 ; SM70-NEXT:    selp.b16 %rs4, 0x7FC0, %rs3, %p2;
 ; SM70-NEXT:    setp.eq.s16 %p3, %rs1, 0;
 ; SM70-NEXT:    selp.b16 %rs5, %rs1, %rs4, %p3;
 ; SM70-NEXT:    setp.eq.s16 %p4, %rs2, 0;
 ; SM70-NEXT:    selp.b16 %rs6, %rs2, %rs5, %p4;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f3, %r6;
-; SM70-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; SM70-NEXT:    cvt.u32.u16 %r7, %rs4;
+; SM70-NEXT:    shl.b32 %r12, %r7, 16;
+; SM70-NEXT:    setp.eq.f32 %p5, %r12, 0f00000000;
 ; SM70-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; SM70-NEXT:    ret;
@@ -1510,25 +1418,21 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<2>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<11>;
-; SM70-NEXT:    .reg .b32 %f<4>;
+; SM70-NEXT:    .reg .b32 %r<17>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b16 %r1, [test_maxnum_param_1];
-; SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    ld.param.b16 %r3, [test_maxnum_param_0];
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f2, %r4;
-; SM70-NEXT:    max.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r5, %f3;
-; SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
-; SM70-NEXT:    add.s32 %r7, %r6, %r5;
-; SM70-NEXT:    add.s32 %r8, %r7, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r9, %r5, 4194304;
-; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
-; SM70-NEXT:    mov.b32 {_, %rs1}, %r10;
+; SM70-NEXT:    shl.b32 %r14, %r1, 16;
+; SM70-NEXT:    ld.param.b16 %r4, [test_maxnum_param_0];
+; SM70-NEXT:    shl.b32 %r15, %r4, 16;
+; SM70-NEXT:    max.f32 %r16, %r15, %r14;
+; SM70-NEXT:    bfe.u32 %r9, %r16, 16, 1;
+; SM70-NEXT:    add.s32 %r10, %r9, %r16;
+; SM70-NEXT:    add.s32 %r11, %r10, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r16, %r16;
+; SM70-NEXT:    or.b32 %r12, %r16, 4194304;
+; SM70-NEXT:    selp.b32 %r13, %r12, %r11, %p1;
+; SM70-NEXT:    mov.b32 {_, %rs1}, %r13;
 ; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM70-NEXT:    ret;
 ;
@@ -1573,54 +1477,47 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<11>;
 ; SM70-NEXT:    .reg .b16 %rs<15>;
-; SM70-NEXT:    .reg .b32 %r<16>;
-; SM70-NEXT:    .reg .b32 %f<7>;
+; SM70-NEXT:    .reg .b32 %r<28>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_maximum_v2_param_0];
 ; SM70-NEXT:    ld.param.b32 %r2, [test_maximum_v2_param_1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f1, %r4;
+; SM70-NEXT:    shl.b32 %r22, %r3, 16;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f2, %r6;
-; SM70-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
+; SM70-NEXT:    shl.b32 %r23, %r6, 16;
+; SM70-NEXT:    setp.gt.f32 %p1, %r23, %r22;
 ; SM70-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
-; SM70-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; SM70-NEXT:    setp.nan.f32 %p2, %r23, %r22;
 ; SM70-NEXT:    selp.b16 %rs6, 0x7FC0, %rs5, %p2;
 ; SM70-NEXT:    setp.eq.s16 %p3, %rs4, 0;
 ; SM70-NEXT:    selp.b16 %rs7, %rs4, %rs6, %p3;
 ; SM70-NEXT:    setp.eq.s16 %p4, %rs2, 0;
 ; SM70-NEXT:    selp.b16 %rs8, %rs2, %rs7, %p4;
-; SM70-NEXT:    cvt.u32.u16 %r7, %rs6;
-; SM70-NEXT:    shl.b32 %r8, %r7, 16;
-; SM70-NEXT:    mov.b32 %f3, %r8;
-; SM70-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; SM70-NEXT:    cvt.u32.u16 %r9, %rs6;
+; SM70-NEXT:    shl.b32 %r24, %r9, 16;
+; SM70-NEXT:    setp.eq.f32 %p5, %r24, 0f00000000;
 ; SM70-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p5;
-; SM70-NEXT:    cvt.u32.u16 %r9, %rs1;
-; SM70-NEXT:    shl.b32 %r10, %r9, 16;
-; SM70-NEXT:    mov.b32 %f4, %r10;
-; SM70-NEXT:    cvt.u32.u16 %r11, %rs3;
-; SM70-NEXT:    shl.b32 %r12, %r11, 16;
-; SM70-NEXT:    mov.b32 %f5, %r12;
-; SM70-NEXT:    setp.gt.f32 %p6, %f5, %f4;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs1;
+; SM70-NEXT:    shl.b32 %r25, %r12, 16;
+; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT:    shl.b32 %r26, %r15, 16;
+; SM70-NEXT:    setp.gt.f32 %p6, %r26, %r25;
 ; SM70-NEXT:    selp.b16 %rs10, %rs3, %rs1, %p6;
-; SM70-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; SM70-NEXT:    setp.nan.f32 %p7, %r26, %r25;
 ; SM70-NEXT:    selp.b16 %rs11, 0x7FC0, %rs10, %p7;
 ; SM70-NEXT:    setp.eq.s16 %p8, %rs3, 0;
 ; SM70-NEXT:    selp.b16 %rs12, %rs3, %rs11, %p8;
 ; SM70-NEXT:    setp.eq.s16 %p9, %rs1, 0;
 ; SM70-NEXT:    selp.b16 %rs13, %rs1, %rs12, %p9;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs11;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f6, %r14;
-; SM70-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; SM70-NEXT:    cvt.u32.u16 %r18, %rs11;
+; SM70-NEXT:    shl.b32 %r27, %r18, 16;
+; SM70-NEXT:    setp.eq.f32 %p10, %r27, 0f00000000;
 ; SM70-NEXT:    selp.b16 %rs14, %rs13, %rs11, %p10;
-; SM70-NEXT:    mov.b32 %r15, {%rs14, %rs9};
-; SM70-NEXT:    st.param.b32 [func_retval0], %r15;
+; SM70-NEXT:    mov.b32 %r21, {%rs14, %rs9};
+; SM70-NEXT:    st.param.b32 [func_retval0], %r21;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_maximum_v2(
@@ -1664,44 +1561,37 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<5>;
-; SM70-NEXT:    .reg .b32 %r<24>;
-; SM70-NEXT:    .reg .b32 %f<7>;
+; SM70-NEXT:    .reg .b32 %r<36>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b32 %r1, [test_maxnum_v2_param_0];
 ; SM70-NEXT:    ld.param.b32 %r2, [test_maxnum_v2_param_1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; SM70-NEXT:    mov.b32 %f1, %r4;
+; SM70-NEXT:    shl.b32 %r30, %r3, 16;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
-; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    mov.b32 %f2, %r6;
-; SM70-NEXT:    max.f32 %f3, %f2, %f1;
-; SM70-NEXT:    mov.b32 %r7, %f3;
-; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    mov.b32 %f4, %r14;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    mov.b32 %f5, %r16;
-; SM70-NEXT:    max.f32 %f6, %f5, %f4;
-; SM70-NEXT:    mov.b32 %r17, %f6;
-; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
-; SM70-NEXT:    add.s32 %r19, %r18, %r17;
-; SM70-NEXT:    add.s32 %r20, %r19, 32767;
-; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
-; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
+; SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
+; SM70-NEXT:    shl.b32 %r31, %r6, 16;
+; SM70-NEXT:    max.f32 %r32, %r31, %r30;
+; SM70-NEXT:    bfe.u32 %r11, %r32, 16, 1;
+; SM70-NEXT:    add.s32 %r12, %r11, %r32;
+; SM70-NEXT:    add.s32 %r13, %r12, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r32, %r32;
+; SM70-NEXT:    or.b32 %r14, %r32, 4194304;
+; SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
+; SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
+; SM70-NEXT:    shl.b32 %r33, %r16, 16;
+; SM70-NEXT:    cvt.u32.u16 %r19, %rs3;
+; SM70-NEXT:    shl.b32 %r34, %r19, 16;
+; SM70-NEXT:    max.f32 %r35, %r34, %r33;
+; SM70-NEXT:    bfe.u32 %r24, %r35, 16, 1;
+; SM70-NEXT:    add.s32 %r25, %r24, %r35;
+; SM70-NEXT:    add.s32 %r26, %r25, 32767;
+; SM70-NEXT:    setp.nan.f32 %p2, %r35, %r35;
+; SM70-NEXT:    or.b32 %r27, %r35, 4194304;
+; SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p2;
+; SM70-NEXT:    prmt.b32 %r29, %r28, %r15, 0x7632U;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r29;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_maxnum_v2(
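
All of the SM70 blocks above funnel through the same scalar f32-to-bf16
round-to-nearest-even sequence, now carried entirely in the untyped "%r"
registers; only the register class changes in this patch, not the sequence.
As a rough C sketch of what that bfe/add/or/selp chain computes (illustrative
only; the helper name and the is_nan flag are not part of the patch):

    #include <stdint.h>

    /* Narrow an f32 bit pattern to bfloat16 with round-to-nearest-even,
       quieting NaNs, mirroring the SM70 checks above. */
    static uint16_t f32_bits_to_bf16(uint32_t x, int is_nan) {
      uint32_t lsb  = (x >> 16) & 1;       /* bfe.u32  %r, %x, 16, 1          */
      uint32_t rne  = x + lsb + 32767;     /* two add.s32: x + lsb + 0x7FFF   */
      uint32_t qnan = x | 4194304;         /* or.b32 with 0x400000 (quiet bit) */
      uint32_t sel  = is_nan ? qnan : rne; /* selp.b32 keyed on setp.nan.f32  */
      return (uint16_t)(sel >> 16);        /* mov.b32 {_, %rs}, %sel keeps the
                                              high half */
    }
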
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
index 5ab684adac58e..0e90b254225eb 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
@@ -11,18 +11,17 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
 ; CHECK-LABEL: test_sin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_sin_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; CHECK-NEXT:    sin.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; CHECK-NEXT:    sin.approx.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; CHECK-NEXT:    sin.approx.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; CHECK-NEXT:    sin.approx.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
@@ -32,18 +31,17 @@ define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
 ; CHECK-LABEL: test_cos(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_cos_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; CHECK-NEXT:    cos.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; CHECK-NEXT:    cos.approx.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; CHECK-NEXT:    cos.approx.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; CHECK-NEXT:    cos.approx.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
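
The approx tests above follow the same shape: each bf16 lane is widened to
f32 (a lossless 16-bit shift of the bit pattern, which is also what
cvt.f32.bf16 computes), the approximate op runs in f32, and
cvt.rn.bf16x2.f32 repacks the pair. A hedged one-lane sketch in C, where
sinf merely stands in for sin.approx.f32 (which is faster but less accurate):

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: one lane of the <2 x bfloat> llvm.sin lowering. */
    static float bf16_lane_sin(uint16_t lane) {
      uint32_t wide = (uint32_t)lane << 16; /* bf16 -> f32 is exact widening */
      float f;
      memcpy(&f, &wide, sizeof f);          /* reinterpret the bits as f32   */
      return sinf(f);                       /* stands in for sin.approx.f32  */
    }
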
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index cd73b78eff97c..3c91fbc9cde56 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -130,22 +130,21 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fdiv(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<7>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
+; CHECK-NEXT:    cvt.f32.bf16 %r3, %rs1;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs3;
-; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.f32.bf16 %f4, %rs2;
-; CHECK-NEXT:    cvt.f32.bf16 %f5, %rs4;
-; CHECK-NEXT:    div.rn.f32 %f6, %f5, %f4;
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    cvt.f32.bf16 %r4, %rs3;
+; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r3;
+; CHECK-NEXT:    cvt.f32.bf16 %r6, %rs2;
+; CHECK-NEXT:    cvt.f32.bf16 %r7, %rs4;
+; CHECK-NEXT:    div.rn.f32 %r8, %r7, %r6;
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r9, %r8, %r5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT:    ret;
   %r = fdiv <2 x bfloat> %a, %b
   ret <2 x bfloat> %r
@@ -258,8 +257,7 @@ define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloa
 ; SM80:       {
 ; SM80-NEXT:    .reg .pred %p<3>;
 ; SM80-NEXT:    .reg .b16 %rs<11>;
-; SM80-NEXT:    .reg .b32 %r<6>;
-; SM80-NEXT:    .reg .b32 %f<5>;
+; SM80-NEXT:    .reg .b32 %r<10>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
@@ -267,19 +265,19 @@ define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloa
 ; SM80-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
 ; SM80-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
+; SM80-NEXT:    cvt.f32.bf16 %r5, %rs1;
 ; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
-; SM80-NEXT:    cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT:    setp.neu.f32 %p1, %f2, %f1;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT:    cvt.f32.bf16 %f4, %rs4;
-; SM80-NEXT:    setp.neu.f32 %p2, %f4, %f3;
+; SM80-NEXT:    cvt.f32.bf16 %r6, %rs3;
+; SM80-NEXT:    setp.neu.f32 %p1, %r6, %r5;
+; SM80-NEXT:    cvt.f32.bf16 %r7, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %r8, %rs4;
+; SM80-NEXT:    setp.neu.f32 %p2, %r8, %r7;
 ; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
 ; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
 ; SM80-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
 ; SM80-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
-; SM80-NEXT:    mov.b32 %r5, {%rs10, %rs9};
-; SM80-NEXT:    st.param.b32 [func_retval0], %r5;
+; SM80-NEXT:    mov.b32 %r9, {%rs10, %rs9};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r9;
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_select_cc(
@@ -311,42 +309,40 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
 ; SM80:       {
 ; SM80-NEXT:    .reg .pred %p<3>;
 ; SM80-NEXT:    .reg .b16 %rs<5>;
-; SM80-NEXT:    .reg .b32 %r<3>;
-; SM80-NEXT:    .reg .b32 %f<11>;
+; SM80-NEXT:    .reg .b32 %r<13>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
-; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2];
-; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3];
-; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
-; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT:    cvt.f32.bf16 %f4, %rs3;
-; SM80-NEXT:    setp.neu.f32 %p1, %f4, %f3;
-; SM80-NEXT:    cvt.f32.bf16 %f5, %rs2;
-; SM80-NEXT:    cvt.f32.bf16 %f6, %rs4;
-; SM80-NEXT:    setp.neu.f32 %p2, %f6, %f5;
-; SM80-NEXT:    ld.param.v2.b32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
-; SM80-NEXT:    selp.f32 %f9, %f2, %f8, %p2;
-; SM80-NEXT:    selp.f32 %f10, %f1, %f7, %p1;
-; SM80-NEXT:    st.param.v2.b32 [func_retval0], {%f10, %f9};
+; SM80-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_bf16_param_0];
+; SM80-NEXT:    ld.param.b32 %r3, [test_select_cc_f32_bf16_param_2];
+; SM80-NEXT:    ld.param.b32 %r4, [test_select_cc_f32_bf16_param_3];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; SM80-NEXT:    cvt.f32.bf16 %r5, %rs1;
+; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; SM80-NEXT:    cvt.f32.bf16 %r6, %rs3;
+; SM80-NEXT:    setp.neu.f32 %p1, %r6, %r5;
+; SM80-NEXT:    cvt.f32.bf16 %r7, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %r8, %rs4;
+; SM80-NEXT:    setp.neu.f32 %p2, %r8, %r7;
+; SM80-NEXT:    ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_bf16_param_1];
+; SM80-NEXT:    selp.f32 %r11, %r2, %r10, %p2;
+; SM80-NEXT:    selp.f32 %r12, %r1, %r9, %p1;
+; SM80-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r11};
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_select_cc_f32_bf16(
 ; SM90:       {
 ; SM90-NEXT:    .reg .pred %p<3>;
-; SM90-NEXT:    .reg .b32 %r<3>;
-; SM90-NEXT:    .reg .b32 %f<7>;
+; SM90-NEXT:    .reg .b32 %r<9>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
-; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3];
-; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2];
-; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r2, %r1;
-; SM90-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
-; SM90-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
-; SM90-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
-; SM90-NEXT:    st.param.v2.b32 [func_retval0], {%f6, %f5};
+; SM90-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_bf16_param_0];
+; SM90-NEXT:    ld.param.b32 %r3, [test_select_cc_f32_bf16_param_3];
+; SM90-NEXT:    ld.param.b32 %r4, [test_select_cc_f32_bf16_param_2];
+; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r4, %r3;
+; SM90-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_bf16_param_1];
+; SM90-NEXT:    selp.f32 %r7, %r2, %r6, %p2;
+; SM90-NEXT:    selp.f32 %r8, %r1, %r5, %p1;
+; SM90-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
 ; SM90-NEXT:    ret;
                                            <2 x bfloat> %c, <2 x bfloat> %d) #0 {
   %cc = fcmp une <2 x bfloat> %c, %d
@@ -359,22 +355,21 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1];
-; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
-; CHECK-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
-; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
-; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_bf16_f32_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_bf16_f32_param_3];
+; CHECK-NEXT:    setp.neu.f32 %p1, %r3, %r5;
+; CHECK-NEXT:    setp.neu.f32 %p2, %r4, %r6;
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
 ; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    mov.b32 %r7, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-NEXT:    ret;
                                           <2 x float> %c, <2 x float> %d) #0 {
   %cc = fcmp une <2 x float> %c, %d
@@ -385,13 +380,12 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
 define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_fptrunc_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %r = fptrunc <2 x float> %a to <2 x bfloat>
   ret <2 x bfloat> %r
@@ -401,15 +395,14 @@ define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
 ; CHECK-LABEL: test_fpext_2xfloat(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
-; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs1;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT:    cvt.f32.bf16 %r2, %rs2;
+; CHECK-NEXT:    cvt.f32.bf16 %r3, %rs1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
 ; CHECK-NEXT:    ret;
   %r = fpext <2 x bfloat> %a to <2 x float>
   ret <2 x float> %r
@@ -468,18 +461,17 @@ define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
 ; CHECK-LABEL: test_sqrt(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; CHECK-NEXT:    sqrt.rn.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; CHECK-NEXT:    sqrt.rn.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; CHECK-NEXT:    sqrt.rn.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
@@ -582,18 +574,17 @@ define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
 ; SM80-LABEL: test_floor(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %r<3>;
-; SM80-NEXT:    .reg .b32 %f<5>;
+; SM80-NEXT:    .reg .b32 %r<7>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT:    cvt.rmi.f32.f32 %f4, %f3;
-; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; SM80-NEXT:    cvt.rmi.f32.f32 %r3, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; SM80-NEXT:    cvt.rmi.f32.f32 %r5, %r4;
+; SM80-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r6;
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_floor(
@@ -617,18 +608,17 @@ define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
 ; SM80-LABEL: test_ceil(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %r<3>;
-; SM80-NEXT:    .reg .b32 %f<5>;
+; SM80-NEXT:    .reg .b32 %r<7>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT:    cvt.rpi.f32.f32 %f4, %f3;
-; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; SM80-NEXT:    cvt.rpi.f32.f32 %r3, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; SM80-NEXT:    cvt.rpi.f32.f32 %r5, %r4;
+; SM80-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r6;
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_ceil(
@@ -652,18 +642,17 @@ define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
 ; SM80-LABEL: test_trunc(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %r<3>;
-; SM80-NEXT:    .reg .b32 %f<5>;
+; SM80-NEXT:    .reg .b32 %r<7>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; SM80-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; SM80-NEXT:    cvt.rzi.f32.f32 %r5, %r4;
+; SM80-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r6;
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_trunc(
@@ -687,18 +676,17 @@ define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {
 ; SM80-LABEL: test_rint(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<3>;
-; SM80-NEXT:    .reg .b32 %r<3>;
-; SM80-NEXT:    .reg .b32 %f<5>;
+; SM80-NEXT:    .reg .b32 %r<7>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT:    cvt.rni.f32.f32 %f4, %f3;
-; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r2, %rs1;
+; SM80-NEXT:    cvt.rni.f32.f32 %r3, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r4, %rs2;
+; SM80-NEXT:    cvt.rni.f32.f32 %r5, %r4;
+; SM80-NEXT:    cvt.rn.bf16x2.f32 %r6, %r5, %r3;
+; SM80-NEXT:    st.param.b32 [func_retval0], %r6;
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_rint(
@@ -723,40 +711,35 @@ define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b32 %f<17>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; CHECK-NEXT:    mov.b32 %r2, %f1;
-; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
-; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
-; CHECK-NEXT:    mov.b32 %f2, %r4;
-; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; CHECK-NEXT:    abs.f32 %f5, %f1;
-; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
-; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
-; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
-; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT:    cvt.f32.bf16 %f9, %rs2;
-; CHECK-NEXT:    mov.b32 %r5, %f9;
-; CHECK-NEXT:    and.b32 %r6, %r5, -2147483648;
-; CHECK-NEXT:    or.b32 %r7, %r6, 1056964608;
-; CHECK-NEXT:    mov.b32 %f10, %r7;
-; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f12, %f11;
-; CHECK-NEXT:    abs.f32 %f13, %f9;
-; CHECK-NEXT:    setp.gt.f32 %p3, %f13, 0f4B000000;
-; CHECK-NEXT:    selp.f32 %f14, %f9, %f12, %p3;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f15, %f9;
-; CHECK-NEXT:    setp.lt.f32 %p4, %f13, 0f3F000000;
-; CHECK-NEXT:    selp.f32 %f16, %f15, %f14, %p4;
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r8, %f16, %f8;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    cvt.f32.bf16 %r25, %rs1;
+; CHECK-NEXT:    and.b32 %r4, %r25, -2147483648;
+; CHECK-NEXT:    or.b32 %r26, %r4, 1056964608;
+; CHECK-NEXT:    add.rn.f32 %r7, %r25, %r26;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r8, %r7;
+; CHECK-NEXT:    abs.f32 %r9, %r25;
+; CHECK-NEXT:    setp.gt.f32 %p1, %r9, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %r10, %r25, %r8, %p1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r11, %r25;
+; CHECK-NEXT:    setp.lt.f32 %p2, %r9, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %r12, %r11, %r10, %p2;
+; CHECK-NEXT:    cvt.f32.bf16 %r27, %rs2;
+; CHECK-NEXT:    and.b32 %r15, %r27, -2147483648;
+; CHECK-NEXT:    or.b32 %r28, %r15, 1056964608;
+; CHECK-NEXT:    add.rn.f32 %r18, %r27, %r28;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r19, %r18;
+; CHECK-NEXT:    abs.f32 %r20, %r27;
+; CHECK-NEXT:    setp.gt.f32 %p3, %r20, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %r21, %r27, %r19, %p3;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r22, %r27;
+; CHECK-NEXT:    setp.lt.f32 %p4, %r20, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %r23, %r22, %r21, %p4;
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r24, %r23, %r12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r24;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index ace31667184b0..055c512401b4c 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -22,7 +22,7 @@ _ZL11compute_vecRK6float3jb.exit:
   %7 = icmp eq i32 %6, 0
   %8 = select i1 %7, float 0.000000e+00, float -1.000000e+00
   store float %8, ptr %ret_vec.sroa.8.i, align 4
-; CHECK: max.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
+; CHECK: max.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, 0f00000000
   %9 = fcmp olt float %8, 0.000000e+00
   %ret_vec.sroa.8.i.val = load float, ptr %ret_vec.sroa.8.i, align 4
   %10 = select i1 %9, float 0.000000e+00, float %ret_vec.sroa.8.i.val
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index c4a62f9f8c508..a2175dd009f5f 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -26,8 +26,8 @@ entry:
 ; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
 ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
 ; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
-; CHECK: ld.global.b32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
-; CHECK: st.local.b32 [{{%rd[0-9]+}}], %f[[A0_REG]]
+; CHECK: ld.global.b32 %r[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
+; CHECK: st.local.b32 [{{%rd[0-9]+}}], %r[[A0_REG]]
 
   %0 = load float, ptr %a, align 4
   store float %0, ptr %buf, align 4
diff --git a/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll b/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
index cfb064c85e074..5e856112c0142 100644
--- a/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
@@ -7,14 +7,13 @@ define i8 @cvt_u8_f32(float %x) {
 ; CHECK-LABEL: cvt_u8_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_u8_f32_param_0];
-; CHECK-NEXT:    cvt.rzi.u16.f32 %rs1, %f1;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_u8_f32_param_0];
+; CHECK-NEXT:    cvt.rzi.u16.f32 %rs1, %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %a = fptoui float %x to i8
   ret i8 %a
@@ -25,11 +24,11 @@ define i8 @cvt_u8_f64(double %x) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [cvt_u8_f64_param_0];
-; CHECK-NEXT:    cvt.rzi.u16.f64 %rs1, %fd1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [cvt_u8_f64_param_0];
+; CHECK-NEXT:    cvt.rzi.u16.f64 %rs1, %rd1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -41,12 +40,12 @@ define float @cvt_f32_i8(i8 %x) {
 ; CHECK-LABEL: cvt_f32_i8(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [cvt_f32_i8_param_0];
-; CHECK-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-NEXT:    cvt.rn.f32.u16 %r1, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a = uitofp i8 %x to float
   ret float %a
@@ -56,12 +55,12 @@ define double @cvt_f64_i8(i8 %x) {
 ; CHECK-LABEL: cvt_f64_i8(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [cvt_f64_i8_param_0];
-; CHECK-NEXT:    cvt.rn.f64.u16 %fd1, %rs1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
+; CHECK-NEXT:    cvt.rn.f64.u16 %rd1, %rs1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd1;
 ; CHECK-NEXT:    ret;
   %a = uitofp i8 %x to double
   ret double %a
@@ -71,12 +70,12 @@ define float @cvt_f32_s8(i8 %x) {
 ; CHECK-LABEL: cvt_f32_s8(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.s8 %rs1, [cvt_f32_s8_param_0];
-; CHECK-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-NEXT:    cvt.rn.f32.s16 %r1, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a = sitofp i8 %x to float
   ret float %a
@@ -86,12 +85,12 @@ define double @cvt_f64_s8(i8 %x) {
 ; CHECK-LABEL: cvt_f64_s8(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.s8 %rs1, [cvt_f64_s8_param_0];
-; CHECK-NEXT:    cvt.rn.f64.s16 %fd1, %rs1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
+; CHECK-NEXT:    cvt.rn.f64.s16 %rd1, %rs1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd1;
 ; CHECK-NEXT:    ret;
   %a = sitofp i8 %x to double
   ret double %a
@@ -101,15 +100,14 @@ define i8 @cvt_s8_f32(float %x) {
 ; CHECK-LABEL: cvt_s8_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_s8_f32_param_0];
-; CHECK-NEXT:    cvt.rzi.s16.f32 %rs1, %f1;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    and.b32 %r2, %r1, 255;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_s8_f32_param_0];
+; CHECK-NEXT:    cvt.rzi.s16.f32 %rs1, %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT:    and.b32 %r3, %r2, 255;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %a = fptosi float %x to i8
   ret i8 %a
@@ -120,11 +118,11 @@ define i8 @cvt_s8_f64(double %x) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [cvt_s8_f64_param_0];
-; CHECK-NEXT:    cvt.rzi.s16.f64 %rs1, %fd1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [cvt_s8_f64_param_0];
+; CHECK-NEXT:    cvt.rzi.s16.f64 %rs1, %rd1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 255;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
diff --git a/llvm/test/CodeGen/NVPTX/convert-fp.ll b/llvm/test/CodeGen/NVPTX/convert-fp.ll
index 49565863a6ef6..debaadedce09a 100644
--- a/llvm/test/CodeGen/NVPTX/convert-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-fp.ll
@@ -4,161 +4,161 @@
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define i16 @cvt_u16_f32(float %x) {
-; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui float %x to i16
   ret i16 %a
 }
 define i16 @cvt_u16_f64(double %x) {
-; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i16
   ret i16 %a
 }
 define i32 @cvt_u32_f32(float %x) {
-; CHECK: cvt.rzi.u32.f32 %r{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.u32.f32 %r{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui float %x to i32
   ret i32 %a
 }
 define i32 @cvt_u32_f64(double %x) {
-; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i32
   ret i32 %a
 }
 define i64 @cvt_u64_f32(float %x) {
-; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui float %x to i64
   ret i64 %a
 }
 define i64 @cvt_u64_f64(double %x) {
-; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i64
   ret i64 %a
 }
 
 define float @cvt_f32_i16(i16 %x) {
-; CHECK: cvt.rn.f32.u16 %f{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: cvt.rn.f32.u16 %r{{[0-9]+}}, %rs{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i16 %x to float
   ret float %a
 }
 define float @cvt_f32_i32(i32 %x) {
-; CHECK: cvt.rn.f32.u32 %f{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: cvt.rn.f32.u32 %r{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i32 %x to float
   ret float %a
 }
 define float @cvt_f32_i64(i64 %x) {
-; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}};
+; CHECK: cvt.rn.f32.u64 %r{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i64 %x to float
   ret float %a
 }
 define double @cvt_f64_i16(i16 %x) {
-; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: cvt.rn.f64.u16 %rd{{[0-9]+}}, %rs{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i16 %x to double
   ret double %a
 }
 define double @cvt_f64_i32(i32 %x) {
-; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: cvt.rn.f64.u32 %rd{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i32 %x to double
   ret double %a
 }
 define double @cvt_f64_i64(i64 %x) {
-; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}};
+; CHECK: cvt.rn.f64.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i64 %x to double
   ret double %a
 }
 
 define float @cvt_f32_f64(double %x) {
-; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rn.f32.f64 %r{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptrunc double %x to float
   ret float %a
 }
 define double @cvt_f64_f32(float %x) {
-; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.f64.f32 %rd{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fpext float %x to double
   ret double %a
 }
 
 define float @cvt_f32_s16(i16 %x) {
-; CHECK: cvt.rn.f32.s16 %f{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: cvt.rn.f32.s16 %r{{[0-9]+}}, %rs{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i16 %x to float
   ret float %a
 }
 define float @cvt_f32_s32(i32 %x) {
-; CHECK: cvt.rn.f32.s32 %f{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: cvt.rn.f32.s32 %r{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i32 %x to float
   ret float %a
 }
 define float @cvt_f32_s64(i64 %x) {
-; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: cvt.rn.f32.s64 %r{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i64 %x to float
   ret float %a
 }
 define double @cvt_f64_s16(i16 %x) {
-; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: cvt.rn.f64.s16 %rd{{[0-9]+}}, %rs{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i16 %x to double
   ret double %a
 }
 define double @cvt_f64_s32(i32 %x) {
-; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: cvt.rn.f64.s32 %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i32 %x to double
   ret double %a
 }
 define double @cvt_f64_s64(i64 %x) {
-; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: cvt.rn.f64.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i64 %x to double
   ret double %a
 }
 
 define i16 @cvt_s16_f32(float %x) {
-; CHECK: cvt.rzi.s16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.s16.f32 %rs{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fptosi float %x to i16
   ret i16 %a
 }
 define i16 @cvt_s16_f64(double %x) {
-; CHECK: cvt.rzi.s16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rzi.s16.f64 %rs{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptosi double %x to i16
   ret i16 %a
 }
 define i32 @cvt_s32_f32(float %x) {
-; CHECK: cvt.rzi.s32.f32 %r{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.s32.f32 %r{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fptosi float %x to i32
   ret i32 %a
 }
 define i32 @cvt_s32_f64(double %x) {
-; CHECK: cvt.rzi.s32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rzi.s32.f64 %r{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptosi double %x to i32
   ret i32 %a
 }
 define i64 @cvt_s64_f32(float %x) {
-; CHECK: cvt.rzi.s64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.s64.f32 %rd{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = fptosi float %x to i64
   ret i64 %a
 }
 define i64 @cvt_s64_f64(double %x) {
-; CHECK: cvt.rzi.s64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: cvt.rzi.s64.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptosi double %x to i64
   ret i64 %a
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
index d5fe45f8051fb..88d0f32065a76 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm100.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
@@ -10,13 +10,12 @@ declare i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
 define i32 @cvt_rn_satf_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rn_satf_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_satf_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rn.satfinite.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_satf_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.satfinite.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
   ret i32 %val
@@ -25,13 +24,12 @@ define i32 @cvt_rn_satf_tf32_f32(float %f1) {
 define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rn_relu_satf_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rn.relu.satfinite.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_satf_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.relu.satfinite.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
   ret i32 %val
@@ -40,13 +38,12 @@ define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) {
 define i32 @cvt_rz_satf_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rz_satf_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_satf_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rz.satfinite.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_satf_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rz.satfinite.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
   ret i32 %val
@@ -55,13 +52,12 @@ define i32 @cvt_rz_satf_tf32_f32(float %f1) {
 define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rz_relu_satf_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rz.relu.satfinite.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_relu_satf_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rz.relu.satfinite.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
   ret i32 %val
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
index 9acbb7984638a..c8b7014d7bc15 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
@@ -10,15 +10,14 @@ define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.satfinite.e2m3x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_sf_e2m3x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_sf_e2m3x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.satfinite.e2m3x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -28,15 +27,14 @@ define i16 @cvt_rn_relu_sf_e2m3x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_relu_sf_e2m3x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_sf_e2m3x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_relu_sf_e2m3x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -46,15 +44,14 @@ define i16 @cvt_rn_sf_e3m2x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_sf_e3m2x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.satfinite.e3m2x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_sf_e3m2x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_sf_e3m2x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.satfinite.e3m2x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -64,15 +61,14 @@ define i16 @cvt_rn_relu_sf_e3m2x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_relu_sf_e3m2x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_sf_e3m2x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_relu_sf_e3m2x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -142,15 +138,14 @@ define i16 @cvt_rz_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rz_ue8m0x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_ue8m0x2_f32_param_1];
-; CHECK-NEXT:    cvt.rz.ue8m0x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rz_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    cvt.rz.ue8m0x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float %f1, float %f2)
     ret i16 %val
@@ -160,15 +155,14 @@ define i16 @cvt_rz_sf_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rz_sf_ue8m0x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1];
-; CHECK-NEXT:    cvt.rz.satfinite.ue8m0x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_sf_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rz_sf_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    cvt.rz.satfinite.ue8m0x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -178,15 +172,14 @@ define i16 @cvt_rp_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rp_ue8m0x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rp_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rp_ue8m0x2_f32_param_1];
-; CHECK-NEXT:    cvt.rp.ue8m0x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rp_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rp_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    cvt.rp.ue8m0x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float %f1, float %f2)
     ret i16 %val
@@ -196,15 +189,14 @@ define i16 @cvt_rp_sf_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rp_sf_ue8m0x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1];
-; CHECK-NEXT:    cvt.rp.satfinite.ue8m0x2.f32 %rs1, %f1, %f2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rp_sf_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rp_sf_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    cvt.rp.satfinite.ue8m0x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -293,19 +285,18 @@ define i16 @cvt_rn_sf_e2m1x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_sf_e2m1x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_sf_e2m1x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_sf_e2m1x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_sf_e2m1x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_sf_e2m1x2_f32_param_1];
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .reg .b8 %e2m1x2_out;
-; CHECK-NEXT:    cvt.rn.satfinite.e2m1x2.f32 %e2m1x2_out, %f1, %f2;
+; CHECK-NEXT:    cvt.rn.satfinite.e2m1x2.f32 %e2m1x2_out, %r1, %r2;
 ; CHECK-NEXT:    cvt.u16.u8 %rs1, %e2m1x2_out;
 ; CHECK-NEXT:    }
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float %f1, float %f2)
     ret i16 %val
@@ -315,19 +306,18 @@ define i16 @cvt_rn_relu_sf_e2m1x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_relu_sf_e2m1x2_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_sf_e2m1x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_sf_e2m1x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_sf_e2m1x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_relu_sf_e2m1x2_f32_param_1];
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .reg .b8 %e2m1x2_out;
-; CHECK-NEXT:    cvt.rn.satfinite.relu.e2m1x2.f32 %e2m1x2_out, %f1, %f2;
+; CHECK-NEXT:    cvt.rn.satfinite.relu.e2m1x2.f32 %e2m1x2_out, %r1, %r2;
 ; CHECK-NEXT:    cvt.u16.u8 %rs1, %e2m1x2_out;
 ; CHECK-NEXT:    }
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
     %val = call i16 @llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float %f1, float %f2)
     ret i16 %val
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
index 0372d281ea355..9ddeb2bb9e94a 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
@@ -6,14 +6,13 @@
 define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_bf16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_bf16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_bf16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float %f1, float %f2)
   ret <2 x bfloat> %val
@@ -22,14 +21,13 @@ define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) {
 define <2 x bfloat> @cvt_rn_relu_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_relu_bf16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_bf16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.relu.bf16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.relu.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float %f1, float %f2)
   ret <2 x bfloat> %val
@@ -38,14 +36,13 @@ define <2 x bfloat> @cvt_rn_relu_bf16x2_f32(float %f1, float %f2) {
 define <2 x bfloat> @cvt_rz_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rz_bf16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_bf16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rz.bf16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rz_bf16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rz.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float %f1, float %f2)
   ret <2 x bfloat> %val
@@ -54,14 +51,13 @@ define <2 x bfloat> @cvt_rz_bf16x2_f32(float %f1, float %f2) {
 define <2 x bfloat> @cvt_rz_relu_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rz_relu_bf16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_relu_bf16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rz.relu.bf16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rz.relu.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float %f1, float %f2)
   ret <2 x bfloat> %val
@@ -75,14 +71,13 @@ declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float, float)
 define <2 x half> @cvt_rn_f16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_f16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_f16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.f16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_f16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn(float %f1, float %f2)
   ret <2 x half> %val
@@ -91,14 +86,13 @@ define <2 x half> @cvt_rn_f16x2_f32(float %f1, float %f2) {
 define <2 x half> @cvt_rn_relu_f16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_relu_f16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_f16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rn.relu.f16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rn.relu.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float %f1, float %f2)
   ret <2 x half> %val
@@ -107,14 +101,13 @@ define <2 x half> @cvt_rn_relu_f16x2_f32(float %f1, float %f2) {
 define <2 x half> @cvt_rz_f16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rz_f16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_f16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rz.f16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rz_f16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rz.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz(float %f1, float %f2)
   ret <2 x half> %val
@@ -123,14 +116,13 @@ define <2 x half> @cvt_rz_f16x2_f32(float %f1, float %f2) {
 define <2 x half> @cvt_rz_relu_f16x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rz_relu_f16x2_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_relu_f16x2_f32_param_1];
-; CHECK-NEXT:    cvt.rz.relu.f16x2.f32 %r1, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [cvt_rz_relu_f16x2_f32_param_1];
+; CHECK-NEXT:    cvt.rz.relu.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float %f1, float %f2)
   ret <2 x half> %val
@@ -145,11 +137,11 @@ define bfloat @cvt_rn_bf16_f32(float %f1) {
 ; CHECK-LABEL: cvt_rn_bf16_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_bf16_f32_param_0];
-; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_bf16_f32_param_0];
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %r1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %val = call bfloat @llvm.nvvm.f2bf16.rn(float %f1)
@@ -160,11 +152,11 @@ define bfloat @cvt_rn_relu_bf16_f32(float %f1) {
 ; CHECK-LABEL: cvt_rn_relu_bf16_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_bf16_f32_param_0];
-; CHECK-NEXT:    cvt.rn.relu.bf16.f32 %rs1, %f1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_bf16_f32_param_0];
+; CHECK-NEXT:    cvt.rn.relu.bf16.f32 %rs1, %r1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %val = call bfloat @llvm.nvvm.f2bf16.rn.relu(float %f1)
@@ -175,11 +167,11 @@ define bfloat @cvt_rz_bf16_f32(float %f1) {
 ; CHECK-LABEL: cvt_rz_bf16_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_bf16_f32_param_0];
-; CHECK-NEXT:    cvt.rz.bf16.f32 %rs1, %f1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_bf16_f32_param_0];
+; CHECK-NEXT:    cvt.rz.bf16.f32 %rs1, %r1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %val = call bfloat @llvm.nvvm.f2bf16.rz(float %f1)
@@ -190,11 +182,11 @@ define bfloat @cvt_rz_relu_bf16_f32(float %f1) {
 ; CHECK-LABEL: cvt_rz_relu_bf16_f32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_bf16_f32_param_0];
-; CHECK-NEXT:    cvt.rz.relu.bf16.f32 %rs1, %f1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_relu_bf16_f32_param_0];
+; CHECK-NEXT:    cvt.rz.relu.bf16.f32 %rs1, %r1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %val = call bfloat @llvm.nvvm.f2bf16.rz.relu(float %f1)
@@ -209,13 +201,12 @@ declare bfloat @llvm.nvvm.f2bf16.rz.relu(float)
 define i32 @cvt_rna_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rna_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rna_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rna.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rna_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rna.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rna(float %f1)
   ret i32 %val
@@ -227,14 +218,13 @@ declare i32 @llvm.nvvm.f2tf32.rna(float)
 define <2 x bfloat> @fold_ff2bf16x2(float %lo, float %hi) {
 ; CHECK-LABEL: fold_ff2bf16x2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fold_ff2bf16x2_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [fold_ff2bf16x2_param_1];
-; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [fold_ff2bf16x2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fold_ff2bf16x2_param_1];
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %loh = fptrunc float %lo to bfloat
   %hih = fptrunc float %hi to bfloat
@@ -246,14 +236,13 @@ define <2 x bfloat> @fold_ff2bf16x2(float %lo, float %hi) {
 define <2 x half> @fold_ff2f16x2(float %lo, float %hi) {
 ; CHECK-LABEL: fold_ff2f16x2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fold_ff2f16x2_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [fold_ff2f16x2_param_1];
-; CHECK-NEXT:    cvt.rn.f16x2.f32 %r1, %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [fold_ff2f16x2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fold_ff2f16x2_param_1];
+; CHECK-NEXT:    cvt.rn.f16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %loh = fptrunc float %lo to half
   %hih = fptrunc float %hi to half
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll
index dba8be1ef5a49..c74ceac03d750 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll
@@ -10,13 +10,12 @@ declare i32 @llvm.nvvm.f2tf32.rz.relu(float %f1)
 define i32 @cvt_rn_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rn_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rn.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rn(float %f1)
   ret i32 %val
@@ -25,13 +24,12 @@ define i32 @cvt_rn_tf32_f32(float %f1) {
 define i32 @cvt_rn_relu_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rn_relu_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rn.relu.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rn_relu_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.relu.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rn.relu(float %f1)
   ret i32 %val
@@ -40,13 +38,12 @@ define i32 @cvt_rn_relu_tf32_f32(float %f1) {
 define i32 @cvt_rz_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rz_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rz.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rz.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rz(float %f1)
   ret i32 %val
@@ -55,13 +52,12 @@ define i32 @cvt_rz_tf32_f32(float %f1) {
 define i32 @cvt_rz_relu_tf32_f32(float %f1) {
 ; CHECK-LABEL: cvt_rz_relu_tf32_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_tf32_f32_param_0];
-; CHECK-NEXT:    cvt.rz.relu.tf32.f32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [cvt_rz_relu_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rz.relu.tf32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call i32 @llvm.nvvm.f2tf32.rz.relu(float %f1)
   ret i32 %val
diff --git a/llvm/test/CodeGen/NVPTX/copysign.ll b/llvm/test/CodeGen/NVPTX/copysign.ll
index d8198182220e9..a94f8669470ec 100644
--- a/llvm/test/CodeGen/NVPTX/copysign.ll
+++ b/llvm/test/CodeGen/NVPTX/copysign.ll
@@ -8,13 +8,13 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define float @fcopysign_f_f(float %a, float %b) {
 ; CHECK-LABEL: fcopysign_f_f(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fcopysign_f_f_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [fcopysign_f_f_param_1];
-; CHECK-NEXT:    copysign.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ld.param.b32 %r1, [fcopysign_f_f_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fcopysign_f_f_param_1];
+; CHECK-NEXT:    copysign.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.copysign.f32(float %a, float %b)
   ret float %val
@@ -23,13 +23,13 @@ define float @fcopysign_f_f(float %a, float %b) {
 define double @fcopysign_d_d(double %a, double %b) {
 ; CHECK-LABEL: fcopysign_d_d(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [fcopysign_d_d_param_0];
-; CHECK-NEXT:    ld.param.b64 %fd2, [fcopysign_d_d_param_1];
-; CHECK-NEXT:    copysign.f64 %fd3, %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [fcopysign_d_d_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [fcopysign_d_d_param_1];
+; CHECK-NEXT:    copysign.f64 %rd3, %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %val = call double @llvm.copysign.f64(double %a, double %b)
   ret double %val
@@ -39,19 +39,19 @@ define float @fcopysign_f_d(float %a, double %b) {
 ; CHECK-LABEL: fcopysign_f_d(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fcopysign_f_d_param_0];
-; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    neg.f32 %f3, %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [fcopysign_f_d_param_0];
+; CHECK-NEXT:    abs.f32 %r2, %r1;
+; CHECK-NEXT:    neg.f32 %r3, %r2;
 ; CHECK-NEXT:    ld.param.b64 %rd1, [fcopysign_f_d_param_1];
 ; CHECK-NEXT:    shr.u64 %rd2, %rd1, 63;
 ; CHECK-NEXT:    and.b64 %rd3, %rd2, 1;
 ; CHECK-NEXT:    setp.ne.b64 %p1, %rd3, 0;
-; CHECK-NEXT:    selp.f32 %f4, %f3, %f2, %p1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NEXT:    selp.f32 %r4, %r3, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %c = fptrunc double %b to float
   %val = call float @llvm.copysign.f32(float %a, float %c)
@@ -63,18 +63,18 @@ define float @fcopysign_f_h(float %a, half %b) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fcopysign_f_h_param_0];
-; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    neg.f32 %f3, %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [fcopysign_f_h_param_0];
+; CHECK-NEXT:    abs.f32 %r2, %r1;
+; CHECK-NEXT:    neg.f32 %r3, %r2;
 ; CHECK-NEXT:    ld.param.b16 %rs1, [fcopysign_f_h_param_1];
 ; CHECK-NEXT:    shr.u16 %rs2, %rs1, 15;
 ; CHECK-NEXT:    and.b16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs3, 0;
-; CHECK-NEXT:    selp.f32 %f4, %f3, %f2, %p1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NEXT:    selp.f32 %r4, %r3, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %c = fpext half %b to float
   %val = call float @llvm.copysign.f32(float %a, float %c)
@@ -86,18 +86,18 @@ define double @fcopysign_d_f(double %a, float %b) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [fcopysign_d_f_param_0];
-; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    neg.f64 %fd3, %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [fcopysign_d_f_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    neg.f64 %rd3, %rd2;
 ; CHECK-NEXT:    ld.param.b32 %r1, [fcopysign_d_f_param_1];
 ; CHECK-NEXT:    shr.u32 %r2, %r1, 31;
 ; CHECK-NEXT:    and.b32 %r3, %r2, 1;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r3, 0;
-; CHECK-NEXT:    selp.f64 %fd4, %fd3, %fd2, %p1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd4;
+; CHECK-NEXT:    selp.f64 %rd4, %rd3, %rd2, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT:    ret;
   %c = fpext float %b to double
   %val = call double @llvm.copysign.f64(double %a, double %c)
@@ -109,18 +109,18 @@ define double @fcopysign_d_h(double %a, half %b) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [fcopysign_d_h_param_0];
-; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    neg.f64 %fd3, %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [fcopysign_d_h_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    neg.f64 %rd3, %rd2;
 ; CHECK-NEXT:    ld.param.b16 %rs1, [fcopysign_d_h_param_1];
 ; CHECK-NEXT:    shr.u16 %rs2, %rs1, 15;
 ; CHECK-NEXT:    and.b16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs3, 0;
-; CHECK-NEXT:    selp.f64 %fd4, %fd3, %fd2, %p1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd4;
+; CHECK-NEXT:    selp.f64 %rd4, %rd3, %rd2, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT:    ret;
   %c = fpext half %b to double
   %val = call double @llvm.copysign.f64(double %a, double %c)
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
index cea3ac37c1964..193cf674ecdfc 100644
--- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
+++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
@@ -59,9 +59,8 @@ define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsme
 ; CHECK-LABEL: test_distributed_shared_cluster_float_atomic(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
@@ -69,8 +68,8 @@ define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsme
 ; CHECK-NEXT:    atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
 ; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
 ; CHECK-NEXT:    atom.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3;
-; CHECK-NEXT:    atom.shared::cluster.add.f32 %f1, [%rd1], 0f3F800000;
-; CHECK-NEXT:    atom.shared::cluster.add.f64 %fd1, [%rd1], 0d3FF0000000000000;
+; CHECK-NEXT:    atom.shared::cluster.add.f32 %r1, [%rd1], 0f3F800000;
+; CHECK-NEXT:    atom.shared::cluster.add.f64 %rd2, [%rd1], 0d3FF0000000000000;
 ; CHECK-NEXT:    ret;
 entry:
   ; Floating point atomic operations
diff --git a/llvm/test/CodeGen/NVPTX/div.ll b/llvm/test/CodeGen/NVPTX/div.ll
index bd8d9a35eed46..885ce8c5030fb 100644
--- a/llvm/test/CodeGen/NVPTX/div.ll
+++ b/llvm/test/CodeGen/NVPTX/div.ll
@@ -5,18 +5,18 @@
 define float @div_full(float %a, float %b) {
 ; CHECK-LABEL: div_full(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [div_full_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [div_full_param_1];
-; CHECK-NEXT:    div.full.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    mov.b32 %f4, 0f40400000;
-; CHECK-NEXT:    div.full.f32 %f5, %f3, %f4;
-; CHECK-NEXT:    div.full.ftz.f32 %f6, %f5, %f2;
-; CHECK-NEXT:    mov.b32 %f7, 0f40800000;
-; CHECK-NEXT:    div.full.ftz.f32 %f8, %f6, %f7;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f8;
+; CHECK-NEXT:    ld.param.b32 %r1, [div_full_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [div_full_param_1];
+; CHECK-NEXT:    div.full.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    mov.b32 %r4, 0f40400000;
+; CHECK-NEXT:    div.full.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    div.full.ftz.f32 %r6, %r5, %r2;
+; CHECK-NEXT:    mov.b32 %r7, 0f40800000;
+; CHECK-NEXT:    div.full.ftz.f32 %r8, %r6, %r7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT:    ret;
   %1 = call float @llvm.nvvm.div.full(float %a, float %b)
   %2 = call float @llvm.nvvm.div.full(float %1, float 3.0)
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 28bef0de48166..b73aea76a4528 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -66,8 +66,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) {
 define float @test_dynamic_stackalloc_unaligned(i64 %0) {
 ; CHECK-32-LABEL: test_dynamic_stackalloc_unaligned(
 ; CHECK-32:       {
-; CHECK-32-NEXT:    .reg .b32 %r<6>;
-; CHECK-32-NEXT:    .reg .b32 %f<2>;
+; CHECK-32-NEXT:    .reg .b32 %r<7>;
 ; CHECK-32-EMPTY:
 ; CHECK-32-NEXT:  // %bb.0:
 ; CHECK-32-NEXT:    ld.param.b32 %r1, [test_dynamic_stackalloc_unaligned_param_0];
@@ -75,13 +74,13 @@ define float @test_dynamic_stackalloc_unaligned(i64 %0) {
 ; CHECK-32-NEXT:    add.s32 %r3, %r2, 7;
 ; CHECK-32-NEXT:    and.b32 %r4, %r3, -8;
 ; CHECK-32-NEXT:    alloca.u32 %r5, %r4, 8;
-; CHECK-32-NEXT:    ld.local.b32 %f1, [%r5];
-; CHECK-32-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-32-NEXT:    ld.local.b32 %r6, [%r5];
+; CHECK-32-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-32-NEXT:    ret;
 ;
 ; CHECK-64-LABEL: test_dynamic_stackalloc_unaligned(
 ; CHECK-64:       {
-; CHECK-64-NEXT:    .reg .b32 %f<2>;
+; CHECK-64-NEXT:    .reg .b32 %r<2>;
 ; CHECK-64-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-64-EMPTY:
 ; CHECK-64-NEXT:  // %bb.0:
@@ -90,8 +89,8 @@ define float @test_dynamic_stackalloc_unaligned(i64 %0) {
 ; CHECK-64-NEXT:    add.s64 %rd3, %rd2, 7;
 ; CHECK-64-NEXT:    and.b64 %rd4, %rd3, -8;
 ; CHECK-64-NEXT:    alloca.u64 %rd5, %rd4, 8;
-; CHECK-64-NEXT:    ld.local.b32 %f1, [%rd5];
-; CHECK-64-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-64-NEXT:    ld.local.b32 %r1, [%rd5];
+; CHECK-64-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-64-NEXT:    ret;
   %4 = alloca float, i64 %0, align 4
   %5 = getelementptr float, ptr %4, i64 0
diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll
index d3aaedf84bce9..4025b38c0f0e4 100644
--- a/llvm/test/CodeGen/NVPTX/f16-abs.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll
@@ -49,13 +49,13 @@ define half @test_fabs(half %a) {
 ; CHECK-NOF16-LABEL: test_fabs(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [test_fabs_param_0];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs1;
-; CHECK-NOF16-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r1, %rs1;
+; CHECK-NOF16-NEXT:    abs.f32 %r2, %r1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
 ; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-NOF16-NEXT:    ret;
 ;
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 23fab22057869..40f6557bbe1a2 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -55,9 +55,9 @@ define half @test_ret_const() #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fadd_param_1];
 ; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -71,9 +71,9 @@ define half @test_fadd(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fadd_v1f16_param_1];
 ; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -89,8 +89,8 @@ define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
 ; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
 ; CHECK-F16-FTZ-DAG:    mov.b16        [[A:%rs[0-9]+]], 0x3C00;
 ; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%r[0-9]+]], [[B32]], 0f3F800000;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -105,8 +105,8 @@ define half @test_fadd_imm_0(half %b) #0 {
 ; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
 ; CHECK-F16-FTZ-DAG:    mov.b16        [[A:%rs[0-9]+]], 0x3C00;
 ; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%r[0-9]+]], [[B32]], 0f3F800000;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -120,9 +120,9 @@ define half @test_fadd_imm_1(half %a) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fsub_param_1];
 ; CHECK-F16-NOFTZ-NEXT:   sub.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-F16-FTZ-NEXT:   sub.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: sub.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: sub.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -137,9 +137,9 @@ define half @test_fsub(half %a, half %b) #0 {
 ; CHECK-F16-NOFTZ-NEXT:   sub.rn.f16     [[R:%rs[0-9]+]], [[Z]], [[A]];
 ; CHECK-F16-FTZ-NEXT:   mov.b16        [[Z:%rs[0-9]+]], 0x0000
 ; CHECK-F16-FTZ-NEXT:   sub.rn.ftz.f16     [[R:%rs[0-9]+]], [[Z]], [[A]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  mov.b32        [[Z:%f[0-9]+]], 0f00000000;
-; CHECK-NOF16-NEXT: sub.rn.f32     [[R32:%f[0-9]+]], [[Z]], [[A32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  mov.b32        [[Z:%r[0-9]+]], 0f00000000;
+; CHECK-NOF16-NEXT: sub.rn.f32     [[R32:%r[0-9]+]], [[Z]], [[A32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -165,9 +165,9 @@ define half @test_fneg(half %a) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fmul_param_1];
 ; CHECK-F16-NOFTZ-NEXT: mul.rn.f16      [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-F16-FTZ-NEXT: mul.rn.ftz.f16      [[R:%rs[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: mul.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: mul.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -179,12 +179,12 @@ define half @test_fmul(half %a, half %b) #0 {
 ; CHECK-LABEL: test_fdiv(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fdiv_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fdiv_param_1];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[F0:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[F1:%f[0-9]+]], [[B]];
-; CHECK-NOFTZ-NEXT: div.rn.f32      [[FR:%f[0-9]+]], [[F0]], [[F1]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[F0:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[F1:%f[0-9]+]], [[B]];
-; CHECK-F16-FTZ-NEXT: div.rn.ftz.f32      [[FR:%f[0-9]+]], [[F0]], [[F1]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[F0:%r[0-9]+]], [[A]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[F1:%r[0-9]+]], [[B]];
+; CHECK-NOFTZ-NEXT: div.rn.f32      [[FR:%r[0-9]+]], [[F0]], [[F1]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[F0:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[F1:%r[0-9]+]], [[B]];
+; CHECK-F16-FTZ-NEXT: div.rn.ftz.f32      [[FR:%r[0-9]+]], [[F0]], [[F1]];
 ; CHECK-NEXT: cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[FR]];
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -196,20 +196,20 @@ define half @test_fdiv(half %a, half %b) #0 {
 ; CHECK-LABEL: test_frem(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_frem_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_frem_param_1];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[FA:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[FB:%f[0-9]+]], [[B]];
-; CHECK-NOFTZ-NEXT: div.rn.f32      [[D:%f[0-9]+]], [[FA]], [[FB]];
-; CHECK-NOFTZ-NEXT: cvt.rzi.f32.f32 [[DI:%f[0-9]+]], [[D]];
-; CHECK-NOFTZ-NEXT: neg.f32         [[DNEG:%f[0-9]+]], [[DI]];
-; CHECK-NOFTZ-NEXT: fma.rn.f32      [[RF:%f[0-9]+]], [[DNEG]], [[FB]], [[FA]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[FA:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[FB:%f[0-9]+]], [[B]];
-; CHECK-F16-FTZ-NEXT: div.rn.ftz.f32      [[D:%f[0-9]+]], [[FA]], [[FB]];
-; CHECK-F16-FTZ-NEXT: cvt.rzi.ftz.f32.f32 [[DI:%f[0-9]+]], [[D]];
-; CHECK-F16-FTZ-NEXT: neg.ftz.f32         [[DNEG:%f[0-9]+]], [[DI]];
-; CHECK-F16-FTZ-NEXT: fma.rn.ftz.f32      [[RF:%f[0-9]+]], [[DNEG]], [[FB]], [[FA]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[FA:%r[0-9]+]], [[A]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[FB:%r[0-9]+]], [[B]];
+; CHECK-NOFTZ-NEXT: div.rn.f32      [[D:%r[0-9]+]], [[FA]], [[FB]];
+; CHECK-NOFTZ-NEXT: cvt.rzi.f32.f32 [[DI:%r[0-9]+]], [[D]];
+; CHECK-NOFTZ-NEXT: neg.f32         [[DNEG:%r[0-9]+]], [[DI]];
+; CHECK-NOFTZ-NEXT: fma.rn.f32      [[RF:%r[0-9]+]], [[DNEG]], [[FB]], [[FA]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[FA:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[FB:%r[0-9]+]], [[B]];
+; CHECK-F16-FTZ-NEXT: div.rn.ftz.f32      [[D:%r[0-9]+]], [[FA]], [[FB]];
+; CHECK-F16-FTZ-NEXT: cvt.rzi.ftz.f32.f32 [[DI:%r[0-9]+]], [[D]];
+; CHECK-F16-FTZ-NEXT: neg.ftz.f32         [[DNEG:%r[0-9]+]], [[DI]];
+; CHECK-F16-FTZ-NEXT: fma.rn.ftz.f32      [[RF:%r[0-9]+]], [[DNEG]], [[FB]], [[FA]];
 ; CHECK-NEXT: testp.infinite.f32 [[ISBINF:%p[0-9]+]], [[FB]];
-; CHECK-NEXT: selp.f32           [[RESULT:%f[0-9]+]], [[FA]], [[RF]], [[ISBINF]];
+; CHECK-NEXT: selp.f32           [[RESULT:%r[0-9]+]], [[FA]], [[RF]], [[ISBINF]];
 ; CHECK-NEXT: cvt.rn.f16.f32     [[R:%rs[0-9]+]], [[RESULT]];
 ; CHECK-NEXT: st.param.b16       [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -246,8 +246,8 @@ define half @test_load(ptr %a) #0 {
 ; CHECK-DAG: ld.b8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
 ; CHECK-DAG: st.b8        [%[[TO]]+1], [[B1]]
 ; CHECK: ret
 define void @test_halfp0a1(ptr noalias readonly %from, ptr %to) {
   %1 = load half, ptr %from , align 1
   store half %1, ptr %to , align 1
   ret void
 }
@@ -344,8 +344,8 @@ define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_select_cc_param_2];
 ; CHECK-DAG:  ld.param.b16    [[D:%rs[0-9]+]], [test_select_cc_param_3];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%r[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%r[0-9]+]], [[C]];
 ; CHECK-NOF16: setp.neu.f32    [[PRED:%p[0-9]+]], [[CF]], [[DF]]
 ; CHECK:      selp.b16        [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
@@ -357,16 +357,16 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
 }
 
 ; CHECK-LABEL: test_select_cc_f32_f16(
-; CHECK-DAG:  ld.param.b32    [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_f32_f16_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_f32_f16_param_1];
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_select_cc_f32_f16_param_2];
 ; CHECK-DAG:  ld.param.b16    [[D:%rs[0-9]+]], [test_select_cc_f32_f16_param_3];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-F16-FTZ:  setp.neu.ftz.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%r[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%r[0-9]+]], [[C]];
 ; CHECK-NOF16: setp.neu.f32    [[PRED:%p[0-9]+]], [[CF]], [[DF]]
-; CHECK-NEXT: selp.f32        [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: selp.f32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
@@ -377,8 +377,8 @@ define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
 
 ; CHECK-LABEL: test_select_cc_f16_f32(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_select_cc_f16_f32_param_0];
-; CHECK-DAG:  ld.param.b32    [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_f16_f32_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_f16_f32_param_3];
 ; CHECK-NOFTZ-DAG:  setp.neu.f32    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-F16-FTZ-DAG:  setp.neu.ftz.f32    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_select_cc_f16_f32_param_1];
@@ -396,8 +396,8 @@ define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_une_param_1];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.neu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.neu.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -412,8 +412,8 @@ define i1 @test_fcmp_une(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ueq_param_1];
 ; CHECK-F16-NOFTZ:  setp.equ.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.equ.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.equ.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -428,8 +428,8 @@ define i1 @test_fcmp_ueq(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ugt_param_1];
 ; CHECK-F16-NOFTZ:  setp.gtu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.gtu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.gtu.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -444,8 +444,8 @@ define i1 @test_fcmp_ugt(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_uge_param_1];
 ; CHECK-F16-NOFTZ:  setp.geu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.geu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.geu.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -460,8 +460,8 @@ define i1 @test_fcmp_uge(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ult_param_1];
 ; CHECK-F16-NOFTZ:  setp.ltu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.ltu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.ltu.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -476,8 +476,8 @@ define i1 @test_fcmp_ult(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ule_param_1];
 ; CHECK-F16-NOFTZ:  setp.leu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.leu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.leu.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -493,8 +493,8 @@ define i1 @test_fcmp_ule(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_uno_param_1];
 ; CHECK-F16-NOFTZ:  setp.nan.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.nan.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.nan.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -509,8 +509,8 @@ define i1 @test_fcmp_uno(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_one_param_1];
 ; CHECK-F16-NOFTZ:  setp.ne.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.ne.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.ne.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -525,8 +525,8 @@ define i1 @test_fcmp_one(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_oeq_param_1];
 ; CHECK-F16-NOFTZ:  setp.eq.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.eq.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.eq.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -541,8 +541,8 @@ define i1 @test_fcmp_oeq(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ogt_param_1];
 ; CHECK-F16-NOFTZ:  setp.gt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.gt.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.gt.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -557,8 +557,8 @@ define i1 @test_fcmp_ogt(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_oge_param_1];
 ; CHECK-F16-NOFTZ:  setp.ge.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.ge.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.ge.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -573,8 +573,8 @@ define i1 @test_fcmp_oge(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_olt_param_1];
 ; CHECK-F16-NOFTZ:  setp.lt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.lt.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.lt.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -589,8 +589,8 @@ define i1 @test_fcmp_olt(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ole_param_1];
 ; CHECK-F16-NOFTZ:  setp.le.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.le.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.le.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -605,8 +605,8 @@ define i1 @test_fcmp_ole(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ord_param_1];
 ; CHECK-F16-NOFTZ:  setp.num.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.num.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.num.f32   [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], 1, 0, [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
@@ -623,8 +623,8 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b64    %[[D:rd[0-9]+]], [test_br_cc_param_3];
 ; CHECK-F16-NOFTZ:  setp.lt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.lt.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%r[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.lt.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: @[[PRED]] bra   [[LABEL:\$L__BB.*]];
 ; CHECK:      st.b32  [%[[C]]],
@@ -757,9 +757,9 @@ define half @test_sitofp_i64(i64 %a) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_uitofp_i32_fadd_param_1];
 ; CHECK-F16-NOFTZ:       add.rn.f16      [[R:%rs[0-9]+]], [[B]], [[C]];
 ; CHECK-F16-FTZ:       add.rn.ftz.f16      [[R:%rs[0-9]+]], [[B]], [[C]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%r[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%r[0-9]+]], [[B32]], [[C32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -775,9 +775,9 @@ define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_sitofp_i32_fadd_param_1];
 ; CHECK-F16-NOFTZ:         add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[C]];
 ; CHECK-F16-FTZ:         add.rn.ftz.f16     [[R:%rs[0-9]+]], [[B]], [[C]];
-; XCHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; XCHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
-; XCHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; XCHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; XCHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%r[0-9]+]], [[C]]
+; XCHECK-NOF16-NEXT: add.rn.f32     [[R32:%r[0-9]+]], [[B32]], [[C32]];
 ; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -788,7 +788,7 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fptrunc_float(
-; CHECK:      ld.param.b32    [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptrunc_float_param_0];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -798,7 +798,7 @@ define half @test_fptrunc_float(float %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptrunc_double(
-; CHECK:      ld.param.b64    [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
+; CHECK:      ld.param.b64    [[A:%rd[0-9]+]], [test_fptrunc_double_param_0];
 ; CHECK:      cvt.rn.f16.f64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -809,8 +809,8 @@ define half @test_fptrunc_double(double %a) #0 {
 
 ; CHECK-LABEL: test_fpext_float(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fpext_float_param_0];
-; CHECK-NOFTZ:      cvt.f32.f16     [[R:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[R:%f[0-9]+]], [[A]];
+; CHECK-NOFTZ:      cvt.f32.f16     [[R:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[R:%r[0-9]+]], [[A]];
 ; CHECK:      st.param.b32    [func_retval0], [[R]];
 ; CHECK:      ret;
 define float @test_fpext_float(half %a) #0 {
@@ -820,7 +820,7 @@ define float @test_fpext_float(half %a) #0 {
 
 ; CHECK-LABEL: test_fpext_double(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fpext_double_param_0];
-; CHECK:      cvt.f64.f16     [[R:%fd[0-9]+]], [[A]];
+; CHECK:      cvt.f64.f16     [[R:%rd[0-9]+]], [[A]];
 ; CHECK:      st.param.b64    [func_retval0], [[R]];
 ; CHECK:      ret;
 define double @test_fpext_double(half %a) #0 {
@@ -875,10 +875,10 @@ declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
 
 ; CHECK-LABEL: test_sqrt(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_sqrt_param_0];
-; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ:      sqrt.rn.f32     [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ:      sqrt.rn.ftz.f32     [[RF:%f[0-9]+]], [[AF]];
+; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOFTZ:      sqrt.rn.f32     [[RF:%r[0-9]+]], [[AF]];
+; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ:      sqrt.rn.ftz.f32     [[RF:%r[0-9]+]], [[AF]];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -896,9 +896,9 @@ define half @test_sqrt(half %a) #0 {
 
 ; CHECK-LABEL: test_sin(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_sin_param_0];
-; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK:      sin.approx.f32  [[RF:%f[0-9]+]], [[AF]];
+; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK:      sin.approx.f32  [[RF:%r[0-9]+]], [[AF]];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -909,9 +909,9 @@ define half @test_sin(half %a) #0 #1 {
 
 ; CHECK-LABEL: test_cos(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_cos_param_0];
-; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK:      cos.approx.f32  [[RF:%f[0-9]+]], [[AF]];
+; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK:      cos.approx.f32  [[RF:%r[0-9]+]], [[AF]];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -968,10 +968,10 @@ define half @test_cos(half %a) #0 #1 {
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_fma_param_2];
 ; CHECK-F16-NOFTZ:      fma.rn.f16      [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
 ; CHECK-F16-FTZ:      fma.rn.ftz.f16      [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%r[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret
@@ -982,10 +982,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
 
 ; CHECK-LABEL: test_fabs(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fabs_param_0];
-; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ:      abs.f32         [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ:      abs.ftz.f32         [[RF:%f[0-9]+]], [[AF]];
+; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOFTZ:      abs.f32         [[RF:%r[0-9]+]], [[AF]];
+; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ:      abs.ftz.f32         [[RF:%r[0-9]+]], [[AF]];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -997,12 +997,12 @@ define half @test_fabs(half %a) #0 {
 ; CHECK-LABEL: test_minnum(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_minnum_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_minnum_param_1];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOFTZ:      min.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[BF:%f[0-9]+]], [[B]];
-; CHECK-F16-FTZ:      min.ftz.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[BF:%r[0-9]+]], [[B]];
+; CHECK-NOFTZ:      min.f32         [[RF:%r[0-9]+]], [[AF]], [[BF]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[BF:%r[0-9]+]], [[B]];
+; CHECK-F16-FTZ:      min.ftz.f32         [[RF:%r[0-9]+]], [[AF]], [[BF]];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -1014,12 +1014,12 @@ define half @test_minnum(half %a, half %b) #0 {
 ; CHECK-LABEL: test_maxnum(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_maxnum_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_maxnum_param_1];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOFTZ:      max.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[BF:%f[0-9]+]], [[B]];
-; CHECK-F16-FTZ:      max.ftz.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[BF:%r[0-9]+]], [[B]];
+; CHECK-NOFTZ:      max.f32         [[RF:%r[0-9]+]], [[AF]], [[BF]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[AF:%r[0-9]+]], [[A]];
+; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[BF:%r[0-9]+]], [[B]];
+; CHECK-F16-FTZ:      max.ftz.f32         [[RF:%r[0-9]+]], [[AF]], [[BF]];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -1043,7 +1043,7 @@ define half @test_copysign(half %a, half %b) #0 {
 
 ; CHECK-LABEL: test_copysign_f32(
 ; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG:  ld.param.b32    [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
+; CHECK-DAG:  ld.param.b32    [[BF:%r[0-9]+]], [test_copysign_f32_param_1];
 ; CHECK-DAG:  mov.b32         [[B:%r[0-9]+]], [[BF]];
 ; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b32         [[BX0:%r[0-9]+]], [[B]], -2147483648;
@@ -1059,7 +1059,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 
 ; CHECK-LABEL: test_copysign_f64(
 ; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG:  ld.param.b64    [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
+; CHECK-DAG:  ld.param.b64    [[BD:%rd[0-9]+]], [test_copysign_f64_param_1];
 ; CHECK-DAG:  mov.b64         [[B:%rd[0-9]+]], [[BD]];
 ; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b64         [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
@@ -1080,8 +1080,8 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b16         [[BX:%rs[0-9]+]], [[BH]], -32768;
 ; CHECK:      or.b16          [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK-NOFTZ: cvt.f32.f16     [[XR:%f[0-9]+]], [[RX]];
-; CHECK-F16-FTZ:   cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[RX]];
+; CHECK-NOFTZ: cvt.f32.f16     [[XR:%r[0-9]+]], [[RX]];
+; CHECK-F16-FTZ:   cvt.ftz.f32.f16 [[XR:%r[0-9]+]], [[RX]];
 ; CHECK:      st.param.b32    [func_retval0], [[XR]];
 ; CHECK:      ret;
 define float @test_copysign_extended(half %a, half %b) #0 {
@@ -1168,10 +1168,10 @@ define half @test_round(half %a) #0 {
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_fmuladd_param_2];
 ; CHECK-F16-NOFTZ:        fma.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
 ; CHECK-F16-FTZ:        fma.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%r[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 7fef947a0e599..636ca801e97b7 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -108,24 +108,23 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-LABEL: test_fadd(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fadd_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r5, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r8, %r7, %r6;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
   %r = fadd <2 x half> %a, %b
   ret <2 x half> %r
@@ -147,20 +146,19 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
 ; CHECK-NOF16-LABEL: test_fadd_imm_0(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f2, %f1, 0f40000000;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f4, %f3, 0f3F800000;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r3, %r2, 0f40000000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NOF16-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NOF16-NEXT:    ret;
   %r = fadd <2 x half> <half 1.0, half 2.0>, %a
   ret <2 x half> %r
@@ -181,20 +179,19 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
 ; CHECK-NOF16-LABEL: test_fadd_imm_1(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_1_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f2, %f1, 0f40000000;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f4, %f3, 0f3F800000;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r3, %r2, 0f40000000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NOF16-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NOF16-NEXT:    ret;
   %r = fadd <2 x half> %a, <half 1.0, half 2.0>
   ret <2 x half> %r
@@ -215,24 +212,23 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-LABEL: test_fsub(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fsub_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fsub_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    sub.rn.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    sub.rn.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %r5, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %r8, %r7, %r6;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
   %r = fsub <2 x half> %a, %b
   ret <2 x half> %r
@@ -253,21 +249,20 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 {
 ; CHECK-NOF16-LABEL: test_fneg(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<6>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    mov.b32 %f2, 0f00000000;
-; CHECK-NOF16-NEXT:    sub.rn.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    sub.rn.f32 %f5, %f2, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f5;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, 0f00000000;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %r4, %r3, %r2;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %r6, %r3, %r5;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %r6;
+; CHECK-NOF16-NEXT:    mov.b32 %r7, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-NOF16-NEXT:    ret;
   %r = fsub <2 x half> <half 0.0, half 0.0>, %a
   ret <2 x half> %r
@@ -288,24 +283,23 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-LABEL: test_fmul(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmul_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmul_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    mul.rn.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    mul.rn.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    mul.rn.f32 %r5, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    mul.rn.f32 %r8, %r7, %r6;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
   %r = fmul <2 x half> %a, %b
   ret <2 x half> %r
@@ -315,24 +309,23 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: test_fdiv(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<7>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NEXT:    div.rn.f32 %f6, %f5, %f4;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NEXT:    div.rn.f32 %r8, %r7, %r6;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT:    ret;
   %r = fdiv <2 x half> %a, %b
   ret <2 x half> %r
@@ -350,34 +343,33 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<15>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_frem_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_frem_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; CHECK-NEXT:    neg.f32 %f5, %f4;
-; CHECK-NEXT:    fma.rn.f32 %f6, %f5, %f1, %f2;
-; CHECK-NEXT:    testp.infinite.f32 %p1, %f1;
-; CHECK-NEXT:    selp.f32 %f7, %f2, %f6, %p1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f7;
-; CHECK-NEXT:    cvt.f32.f16 %f8, %rs1;
-; CHECK-NEXT:    cvt.f32.f16 %f9, %rs3;
-; CHECK-NEXT:    div.rn.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f11, %f10;
-; CHECK-NEXT:    neg.f32 %f12, %f11;
-; CHECK-NEXT:    fma.rn.f32 %f13, %f12, %f8, %f9;
-; CHECK-NEXT:    testp.infinite.f32 %p2, %f8;
-; CHECK-NEXT:    selp.f32 %f14, %f9, %f13, %p2;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f14;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r3;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-NEXT:    neg.f32 %r7, %r6;
+; CHECK-NEXT:    fma.rn.f32 %r8, %r7, %r3, %r4;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %r3;
+; CHECK-NEXT:    selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r9;
+; CHECK-NEXT:    cvt.f32.f16 %r10, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r11, %rs3;
+; CHECK-NEXT:    div.rn.f32 %r12, %r11, %r10;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r13, %r12;
+; CHECK-NEXT:    neg.f32 %r14, %r13;
+; CHECK-NEXT:    fma.rn.f32 %r15, %r14, %r10, %r11;
+; CHECK-NEXT:    testp.infinite.f32 %p2, %r10;
+; CHECK-NEXT:    selp.f32 %r16, %r11, %r15, %p2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r16;
+; CHECK-NEXT:    mov.b32 %r17, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
 ; CHECK-NEXT:    ret;
   %r = frem <2 x half> %a, %b
   ret <2 x half> %r
@@ -590,8 +582,7 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<11>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<6>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
@@ -599,19 +590,19 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs3;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs4;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r6, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r8, %r7;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
-; CHECK-NOF16-NEXT:    mov.b32 %r5, {%rs10, %rs9};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs10, %rs9};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
   %cc = fcmp une <2 x half> %c, %d
   %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
@@ -622,43 +613,41 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-F16-LABEL: test_select_cc_f32_f16(
 ; CHECK-F16:       {
 ; CHECK-F16-NEXT:    .reg .pred %p<3>;
-; CHECK-F16-NEXT:    .reg .b32 %r<3>;
-; CHECK-F16-NEXT:    .reg .b32 %f<7>;
+; CHECK-F16-NEXT:    .reg .b32 %r<9>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
-; CHECK-F16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
-; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
-; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r1, %r2;
-; CHECK-F16-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
-; CHECK-F16-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
-; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%f6, %f5};
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
+; CHECK-F16-NEXT:    ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
+; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r5, %r6;
+; CHECK-F16-NEXT:    selp.f32 %r7, %r2, %r4, %p2;
+; CHECK-F16-NEXT:    selp.f32 %r8, %r1, %r3, %p1;
+; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-NOF16-LABEL: test_select_cc_f32_f16(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<11>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
-; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
-; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs1;
-; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs3;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %f6, %f5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f7, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f8, %rs4;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f8, %f7;
-; CHECK-NOF16-NEXT:    selp.f32 %f9, %f2, %f4, %p2;
-; CHECK-NOF16-NEXT:    selp.f32 %f10, %f1, %f3, %p1;
-; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%f10, %f9};
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r8, %r7;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r10, %r9;
+; CHECK-NOF16-NEXT:    selp.f32 %r11, %r2, %r4, %p2;
+; CHECK-NOF16-NEXT:    selp.f32 %r12, %r1, %r3, %p1;
+; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r11};
 ; CHECK-NOF16-NEXT:    ret;
                                            <2 x half> %c, <2 x half> %d) #0 {
   %cc = fcmp une <2 x half> %c, %d
@@ -671,22 +660,21 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
-; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
-; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
+; CHECK-NEXT:    setp.neu.f32 %p1, %r3, %r5;
+; CHECK-NEXT:    setp.neu.f32 %p2, %r4, %r6;
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
 ; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    mov.b32 %r7, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-NEXT:    ret;
                                           <2 x float> %c, <2 x float> %d) #0 {
   %cc = fcmp une <2 x float> %c, %d
@@ -715,20 +703,19 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_une_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_une_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -759,20 +746,19 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ueq_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ueq_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.equ.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.equ.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.equ.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.equ.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -803,20 +789,19 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ugt_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ugt_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.gtu.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.gtu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.gtu.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.gtu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -847,20 +832,19 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_uge_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_uge_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.geu.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.geu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.geu.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.geu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -891,20 +875,19 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ult_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ult_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.ltu.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.ltu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.ltu.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.ltu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -935,20 +918,19 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ule_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ule_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.leu.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.leu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.leu.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.leu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -980,20 +962,19 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_uno_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_uno_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1024,20 +1005,19 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_one_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_one_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.ne.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.ne.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.ne.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.ne.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1068,20 +1048,19 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_oeq_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_oeq_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1112,20 +1091,19 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ogt_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ogt_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.gt.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1156,20 +1134,19 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_oge_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_oge_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.ge.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.ge.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.ge.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.ge.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1200,20 +1177,19 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_olt_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_olt_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.lt.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1244,20 +1220,19 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ole_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ole_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.le.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.le.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.le.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.le.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1288,20 +1263,19 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ord_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ord_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.num.f32 %p1, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
-; CHECK-NOF16-NEXT:    setp.num.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.num.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NOF16-NEXT:    setp.num.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, -1, 0, %p1;
@@ -1471,25 +1445,24 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-LABEL: test_uitofp_2xi32_fadd(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<11>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs1, %r1;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs2, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r6, %r5, %r4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r6;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r9, %r8, %r7;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r9;
+; CHECK-NOF16-NEXT:    mov.b32 %r10, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r10;
 ; CHECK-NOF16-NEXT:    ret;
   %c = uitofp <2 x i32> %a to <2 x half>
   %r = fadd <2 x half> %b, %c
@@ -1515,25 +1488,24 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-LABEL: test_sitofp_2xi32_fadd(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<11>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs1, %r1;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs2, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r6, %r5, %r4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r6;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r9, %r8, %r7;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r9;
+; CHECK-NOF16-NEXT:    mov.b32 %r10, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r10;
 ; CHECK-NOF16-NEXT:    ret;
   %c = sitofp <2 x i32> %a to <2 x half>
   %r = fadd <2 x half> %b, %c
@@ -1544,15 +1516,14 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_fptrunc_2xfloat(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs1, %f2;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f1;
-; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %r1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %r = fptrunc <2 x float> %a to <2 x half>
   ret <2 x half> %r
@@ -1563,12 +1534,12 @@ define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
-; CHECK-NEXT:    cvt.rn.f16.f64 %rs1, %fd2;
-; CHECK-NEXT:    cvt.rn.f16.f64 %rs2, %fd1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-NEXT:    cvt.rn.f16.f64 %rs1, %rd2;
+; CHECK-NEXT:    cvt.rn.f16.f64 %rs2, %rd1;
 ; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -1580,15 +1551,14 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_fpext_2xfloat(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
 ; CHECK-NEXT:    ret;
   %r = fpext <2 x half> %a to <2 x float>
   ret <2 x float> %r
@@ -1599,14 +1569,14 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xdouble_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f64.f16 %fd1, %rs2;
-; CHECK-NEXT:    cvt.f64.f16 %fd2, %rs1;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%fd2, %fd1};
+; CHECK-NEXT:    cvt.f64.f16 %rd1, %rs2;
+; CHECK-NEXT:    cvt.f64.f16 %rd2, %rs1;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
 ; CHECK-NEXT:    ret;
   %r = fpext <2 x half> %a to <2 x double>
   ret <2 x double> %r
@@ -1642,13 +1612,12 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
 ; CHECK-LABEL: test_bitcast_float_to_2xhalf(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [test_bitcast_float_to_2xhalf_param_0];
-; CHECK-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_float_to_2xhalf_param_0];
+; CHECK-NEXT:    mov.b32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = bitcast float %a to <2 x half>
   ret <2 x half> %r
@@ -1657,13 +1626,12 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
 define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_bitcast_2xhalf_to_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xhalf_to_float_param_0];
-; CHECK-NEXT:    mov.b32 %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-NEXT:    mov.b32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = bitcast <2 x half> %a to float
   ret float %r
@@ -1697,20 +1665,19 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_sqrt(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NEXT:    sqrt.rn.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    sqrt.rn.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NEXT:    sqrt.rn.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
   ret <2 x half> %r
@@ -1727,20 +1694,19 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
 ; CHECK-LABEL: test_sin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_sin_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    sin.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NEXT:    sin.approx.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    sin.approx.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NEXT:    sin.approx.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
   ret <2 x half> %r
@@ -1750,20 +1716,19 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ; CHECK-LABEL: test_cos(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_cos_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    cos.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NEXT:    cos.approx.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    cos.approx.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NEXT:    cos.approx.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
   ret <2 x half> %r
@@ -1828,28 +1793,27 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 ; CHECK-NOF16-LABEL: test_fma(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<9>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fma_param_2];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fma_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fma_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %f4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f7, %rs5;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %f8;
-; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs8, %rs7};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs6;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %r7, %r6, %r5, %r4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %r7;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs5;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %r11, %r10, %r9, %r8;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %r11;
+; CHECK-NOF16-NEXT:    mov.b32 %r12, {%rs8, %rs7};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r12;
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   ret <2 x half> %r
@@ -1869,20 +1833,19 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 {
 ; CHECK-NOF16-LABEL: test_fabs(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NOF16-NEXT:    abs.f32 %f4, %f3;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT:    abs.f32 %r3, %r2;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NOF16-NEXT:    abs.f32 %r5, %r4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NOF16-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
   ret <2 x half> %r
@@ -1892,24 +1855,23 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: test_minnum(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<7>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_minnum_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_minnum_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NEXT:    min.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NEXT:    min.f32 %f6, %f5, %f4;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NEXT:    min.f32 %r5, %r4, %r3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NEXT:    min.f32 %r8, %r7, %r6;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %r
@@ -1919,24 +1881,23 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: test_maxnum(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<7>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_maxnum_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_maxnum_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NEXT:    max.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NEXT:    max.f32 %f6, %f5, %f4;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NEXT:    max.f32 %r5, %r4, %r3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NEXT:    max.f32 %r8, %r7, %r6;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %r
@@ -1983,43 +1944,41 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
 ; CHECK-F16-LABEL: test_copysign_f32(
 ; CHECK-F16:       {
 ; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
-; CHECK-F16-NEXT:    .reg .b32 %r<6>;
-; CHECK-F16-NEXT:    .reg .b32 %f<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<8>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs1, %f2;
-; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs2, %f1;
-; CHECK-F16-NEXT:    mov.b32 %r2, {%rs2, %rs1};
-; CHECK-F16-NEXT:    and.b32 %r3, %r2, -2147450880;
-; CHECK-F16-NEXT:    and.b32 %r4, %r1, 2147450879;
-; CHECK-F16-NEXT:    or.b32 %r5, %r4, %r3;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs1, %r3;
+; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-F16-NEXT:    mov.b32 %r4, {%rs2, %rs1};
+; CHECK-F16-NEXT:    and.b32 %r5, %r4, -2147450880;
+; CHECK-F16-NEXT:    and.b32 %r6, %r1, 2147450879;
+; CHECK-F16-NEXT:    or.b32 %r7, %r6, %r5;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-NOF16-LABEL: test_copysign_f32(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, 32767;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
-; CHECK-NOF16-NEXT:    and.b32 %r3, %r2, -2147483648;
-; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
+; CHECK-NOF16-NEXT:    mov.b32 %r4, %r3;
+; CHECK-NOF16-NEXT:    and.b32 %r5, %r4, -2147483648;
+; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
 ; CHECK-NOF16-NEXT:    or.b16 %rs5, %rs3, %rs4;
 ; CHECK-NOF16-NEXT:    and.b16 %rs6, %rs1, 32767;
-; CHECK-NOF16-NEXT:    mov.b32 %r4, %f1;
-; CHECK-NOF16-NEXT:    and.b32 %r5, %r4, -2147483648;
-; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; }
+; CHECK-NOF16-NEXT:    mov.b32 %r6, %r2;
+; CHECK-NOF16-NEXT:    and.b32 %r7, %r6, -2147483648;
+; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r7; }
 ; CHECK-NOF16-NEXT:    or.b16 %rs8, %rs6, %rs7;
-; CHECK-NOF16-NEXT:    mov.b32 %r6, {%rs8, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NOF16-NEXT:    mov.b32 %r8, {%rs8, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r8;
 ; CHECK-NOF16-NEXT:    ret;
   %tb = fptrunc <2 x float> %b to <2 x half>
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
@@ -2031,13 +1990,13 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
 ; CHECK-F16:       {
 ; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-F16-NEXT:    .reg .b32 %r<6>;
-; CHECK-F16-NEXT:    .reg .b64 %fd<3>;
+; CHECK-F16-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
-; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs1, %fd2;
-; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs2, %fd1;
+; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs1, %rd2;
+; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs2, %rd1;
 ; CHECK-F16-NEXT:    mov.b32 %r2, {%rs2, %rs1};
 ; CHECK-F16-NEXT:    and.b32 %r3, %r2, -2147450880;
 ; CHECK-F16-NEXT:    and.b32 %r4, %r1, 2147450879;
@@ -2049,24 +2008,23 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b64 %rd<7>;
-; CHECK-NOF16-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NOF16-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, 32767;
-; CHECK-NOF16-NEXT:    mov.b64 %rd1, %fd2;
-; CHECK-NOF16-NEXT:    and.b64 %rd2, %rd1, -9223372036854775808;
-; CHECK-NOF16-NEXT:    shr.u64 %rd3, %rd2, 48;
-; CHECK-NOF16-NEXT:    cvt.u16.u64 %rs4, %rd3;
+; CHECK-NOF16-NEXT:    mov.b64 %rd3, %rd2;
+; CHECK-NOF16-NEXT:    and.b64 %rd4, %rd3, -9223372036854775808;
+; CHECK-NOF16-NEXT:    shr.u64 %rd5, %rd4, 48;
+; CHECK-NOF16-NEXT:    cvt.u16.u64 %rs4, %rd5;
 ; CHECK-NOF16-NEXT:    or.b16 %rs5, %rs3, %rs4;
 ; CHECK-NOF16-NEXT:    and.b16 %rs6, %rs1, 32767;
-; CHECK-NOF16-NEXT:    mov.b64 %rd4, %fd1;
-; CHECK-NOF16-NEXT:    and.b64 %rd5, %rd4, -9223372036854775808;
-; CHECK-NOF16-NEXT:    shr.u64 %rd6, %rd5, 48;
-; CHECK-NOF16-NEXT:    cvt.u16.u64 %rs7, %rd6;
+; CHECK-NOF16-NEXT:    mov.b64 %rd6, %rd1;
+; CHECK-NOF16-NEXT:    and.b64 %rd7, %rd6, -9223372036854775808;
+; CHECK-NOF16-NEXT:    shr.u64 %rd8, %rd7, 48;
+; CHECK-NOF16-NEXT:    cvt.u16.u64 %rs7, %rd8;
 ; CHECK-NOF16-NEXT:    or.b16 %rs8, %rs6, %rs7;
 ; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs8, %rs5};
 ; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -2080,8 +2038,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-F16-LABEL: test_copysign_extended(
 ; CHECK-F16:       {
 ; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
-; CHECK-F16-NEXT:    .reg .b32 %r<6>;
-; CHECK-F16-NEXT:    .reg .b32 %f<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<8>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
 ; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_copysign_extended_param_1];
@@ -2090,16 +2047,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-F16-NEXT:    and.b32 %r4, %r1, 2147450879;
 ; CHECK-F16-NEXT:    or.b32 %r5, %r4, %r3;
 ; CHECK-F16-NEXT:    mov.b32 {%rs1, %rs2}, %r5;
-; CHECK-F16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-F16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
+; CHECK-F16-NEXT:    cvt.f32.f16 %r6, %rs2;
+; CHECK-F16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%r7, %r6};
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-NOF16-LABEL: test_copysign_extended(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<11>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_copysign_extended_param_1];
@@ -2112,9 +2068,9 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs2, -32768;
 ; CHECK-NOF16-NEXT:    and.b16 %rs9, %rs5, 32767;
 ; CHECK-NOF16-NEXT:    or.b16 %rs10, %rs9, %rs8;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs10;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs7;
-; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs10;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs7;
+; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
   %xr = fpext <2 x half> %r to <2 x float>
@@ -2235,42 +2191,41 @@ define <2 x half> @test_round(<2 x half> %a) #0 {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b32 %f<17>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    mov.b32 %r2, %f1;
-; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
-; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
-; CHECK-NEXT:    mov.b32 %f2, %r4;
-; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; CHECK-NEXT:    abs.f32 %f5, %f1;
-; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
-; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
-; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
-; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f8;
-; CHECK-NEXT:    cvt.f32.f16 %f9, %rs1;
-; CHECK-NEXT:    mov.b32 %r5, %f9;
-; CHECK-NEXT:    and.b32 %r6, %r5, -2147483648;
-; CHECK-NEXT:    or.b32 %r7, %r6, 1056964608;
-; CHECK-NEXT:    mov.b32 %f10, %r7;
-; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f12, %f11;
-; CHECK-NEXT:    abs.f32 %f13, %f9;
-; CHECK-NEXT:    setp.gt.f32 %p3, %f13, 0f4B000000;
-; CHECK-NEXT:    selp.f32 %f14, %f9, %f12, %p3;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f15, %f9;
-; CHECK-NEXT:    setp.lt.f32 %p4, %f13, 0f3F000000;
-; CHECK-NEXT:    selp.f32 %f16, %f15, %f14, %p4;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f16;
-; CHECK-NEXT:    mov.b32 %r8, {%rs4, %rs3};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    mov.b32 %r3, %r2;
+; CHECK-NEXT:    and.b32 %r4, %r3, -2147483648;
+; CHECK-NEXT:    or.b32 %r5, %r4, 1056964608;
+; CHECK-NEXT:    mov.b32 %r6, %r5;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, %r6;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r8, %r7;
+; CHECK-NEXT:    abs.f32 %r9, %r2;
+; CHECK-NEXT:    setp.gt.f32 %p1, %r9, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %r10, %r2, %r8, %p1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r11, %r2;
+; CHECK-NEXT:    setp.lt.f32 %p2, %r9, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %r12, %r11, %r10, %p2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r12;
+; CHECK-NEXT:    cvt.f32.f16 %r13, %rs1;
+; CHECK-NEXT:    mov.b32 %r14, %r13;
+; CHECK-NEXT:    and.b32 %r15, %r14, -2147483648;
+; CHECK-NEXT:    or.b32 %r16, %r15, 1056964608;
+; CHECK-NEXT:    mov.b32 %r17, %r16;
+; CHECK-NEXT:    add.rn.f32 %r18, %r13, %r17;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r19, %r18;
+; CHECK-NEXT:    abs.f32 %r20, %r13;
+; CHECK-NEXT:    setp.gt.f32 %p3, %r20, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %r21, %r13, %r19, %p3;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r22, %r13;
+; CHECK-NEXT:    setp.lt.f32 %p4, %r20, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %r23, %r22, %r21, %p4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %r23;
+; CHECK-NEXT:    mov.b32 %r24, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r24;
 ; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
   ret <2 x half> %r
@@ -2292,28 +2247,27 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
 ; CHECK-NOF16-LABEL: test_fmuladd(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<9>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_2];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %f4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f7, %rs5;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %f8;
-; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs8, %rs7};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs6;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %r7, %r6, %r5, %r4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %r7;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs5;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %r11, %r10, %r9, %r8;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %r11;
+; CHECK-NOF16-NEXT:    mov.b32 %r12, {%rs8, %rs7};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r12;
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   ret <2 x half> %r
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 939782eccff55..fd92375eb7b77 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -9,12 +9,12 @@ declare float @llvm.nvvm.ex2.approx.f(float)
 define float @ex2_float(float %0) {
 ; CHECK-LABEL: ex2_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [ex2_float_param_0];
-; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [ex2_float_param_0];
+; CHECK-NEXT:    ex2.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.ex2.approx.f(float %0)
   ret float %res
@@ -24,12 +24,12 @@ define float @ex2_float(float %0) {
 define float @ex2_float_ftz(float %0) {
 ; CHECK-LABEL: ex2_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [ex2_float_ftz_param_0];
-; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [ex2_float_ftz_param_0];
+; CHECK-NEXT:    ex2.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
   ret float %res
diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
index 2b101bc3af43a..29dede097610d 100644
--- a/llvm/test/CodeGen/NVPTX/f32-lg2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
@@ -10,12 +10,12 @@ declare float @llvm.nvvm.lg2.approx.ftz.f(float)
 define float @lg2_float(float %0) {
 ; CHECK-LABEL: lg2_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [lg2_float_param_0];
-; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [lg2_float_param_0];
+; CHECK-NEXT:    lg2.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.lg2.approx.f(float %0)
   ret float %res
@@ -25,12 +25,12 @@ define float @lg2_float(float %0) {
 define float @lg2_float_ftz(float %0) {
 ; CHECK-LABEL: lg2_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [lg2_float_ftz_param_0];
-; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [lg2_float_ftz_param_0];
+; CHECK-NEXT:    lg2.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0)
   ret float %res
diff --git a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll
index 51434f7566c14..30f9dcc27edbe 100644
--- a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll
@@ -18,12 +18,12 @@ declare <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat>)
 define float @fabs_float(float %a) {
 ; CHECK-LABEL: fabs_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fabs_float_param_0];
-; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [fabs_float_param_0];
+; CHECK-NEXT:    abs.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.fabs.f32(float %a)
   ret float %ret
@@ -32,12 +32,12 @@ define float @fabs_float(float %a) {
 define float @fabs_float_ftz(float %a) {
 ; CHECK-LABEL: fabs_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fabs_float_ftz_param_0];
-; CHECK-NEXT:    abs.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [fabs_float_ftz_param_0];
+; CHECK-NEXT:    abs.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.fabs.ftz.f32(float %a)
   ret float %ret
@@ -46,12 +46,12 @@ define float @fabs_float_ftz(float %a) {
 define double @fabs_double(double %a) {
 ; CHECK-LABEL: fabs_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [fabs_double_param_0];
-; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [fabs_double_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %ret = call double @llvm.nvvm.fabs.f64(double %a)
   ret double %ret
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
index c8940d9ae2a90..c3212954668e2 100644
--- a/llvm/test/CodeGen/NVPTX/fexp2.ll
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -13,32 +13,32 @@ target triple = "nvptx64-nvidia-cuda"
 define float @exp2_test(float %in) {
 ; CHECK-LABEL: exp2_test(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.b32 %f1, [exp2_test_param_0];
-; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [exp2_test_param_0];
+; CHECK-NEXT:    ex2.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_test(
 ; CHECK-FP16:       {
-; CHECK-FP16-NEXT:    .reg .b32 %f<3>;
+; CHECK-FP16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.b32 %f1, [exp2_test_param_0];
-; CHECK-FP16-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-FP16-NEXT:    ld.param.b32 %r1, [exp2_test_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.f32 %r2, %r1;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_test(
 ; CHECK-BF16:       {
-; CHECK-BF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-BF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-BF16-EMPTY:
 ; CHECK-BF16-NEXT:  // %bb.0: // %entry
-; CHECK-BF16-NEXT:    ld.param.b32 %f1, [exp2_test_param_0];
-; CHECK-BF16-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-BF16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-BF16-NEXT:    ld.param.b32 %r1, [exp2_test_param_0];
+; CHECK-BF16-NEXT:    ex2.approx.f32 %r2, %r1;
+; CHECK-BF16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-BF16-NEXT:    ret;
 entry:
   %exp2 = call float @llvm.exp2.f32(float %in)
@@ -49,32 +49,32 @@ entry:
 define float @exp2_ftz_test(float %in) #0 {
 ; CHECK-LABEL: exp2_ftz_test(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.b32 %f1, [exp2_ftz_test_param_0];
-; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [exp2_ftz_test_param_0];
+; CHECK-NEXT:    ex2.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_ftz_test(
 ; CHECK-FP16:       {
-; CHECK-FP16-NEXT:    .reg .b32 %f<3>;
+; CHECK-FP16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.b32 %f1, [exp2_ftz_test_param_0];
-; CHECK-FP16-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-FP16-NEXT:    ld.param.b32 %r1, [exp2_ftz_test_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.ftz.f32 %r2, %r1;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_ftz_test(
 ; CHECK-BF16:       {
-; CHECK-BF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-BF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-BF16-EMPTY:
 ; CHECK-BF16-NEXT:  // %bb.0: // %entry
-; CHECK-BF16-NEXT:    ld.param.b32 %f1, [exp2_ftz_test_param_0];
-; CHECK-BF16-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-BF16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-BF16-NEXT:    ld.param.b32 %r1, [exp2_ftz_test_param_0];
+; CHECK-BF16-NEXT:    ex2.approx.ftz.f32 %r2, %r1;
+; CHECK-BF16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-BF16-NEXT:    ret;
 entry:
   %exp2 = call float @llvm.exp2.f32(float %in)
@@ -85,35 +85,35 @@ entry:
 define <2 x float> @exp2_test_v(<2 x float> %in) {
 ; CHECK-LABEL: exp2_test_v(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
-; CHECK-NEXT:    ex2.approx.f32 %f3, %f2;
-; CHECK-NEXT:    ex2.approx.f32 %f4, %f1;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [exp2_test_v_param_0];
+; CHECK-NEXT:    ex2.approx.f32 %r3, %r2;
+; CHECK-NEXT:    ex2.approx.f32 %r4, %r1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_test_v(
 ; CHECK-FP16:       {
-; CHECK-FP16-NEXT:    .reg .b32 %f<5>;
+; CHECK-FP16-NEXT:    .reg .b32 %r<5>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
-; CHECK-FP16-NEXT:    ex2.approx.f32 %f3, %f2;
-; CHECK-FP16-NEXT:    ex2.approx.f32 %f4, %f1;
-; CHECK-FP16-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
+; CHECK-FP16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [exp2_test_v_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.f32 %r3, %r2;
+; CHECK-FP16-NEXT:    ex2.approx.f32 %r4, %r1;
+; CHECK-FP16-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_test_v(
 ; CHECK-BF16:       {
-; CHECK-BF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-BF16-NEXT:    .reg .b32 %r<5>;
 ; CHECK-BF16-EMPTY:
 ; CHECK-BF16-NEXT:  // %bb.0: // %entry
-; CHECK-BF16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
-; CHECK-BF16-NEXT:    ex2.approx.f32 %f3, %f2;
-; CHECK-BF16-NEXT:    ex2.approx.f32 %f4, %f1;
-; CHECK-BF16-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
+; CHECK-BF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [exp2_test_v_param_0];
+; CHECK-BF16-NEXT:    ex2.approx.f32 %r3, %r2;
+; CHECK-BF16-NEXT:    ex2.approx.f32 %r4, %r1;
+; CHECK-BF16-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-BF16-NEXT:    ret;
 entry:
   %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
@@ -127,13 +127,13 @@ define half @exp2_f16_test(half %in) {
 ; CHECK-LABEL: exp2_f16_test(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %rs1, [exp2_f16_test_param_0];
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs1;
-; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT:    ex2.approx.f32 %r2, %r1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-NEXT:    ret;
 ;
@@ -167,13 +167,13 @@ define half @exp2_f16_ftz_test(half %in) #0 {
 ; CHECK-LABEL: exp2_f16_ftz_test(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0];
-; CHECK-NEXT:    cvt.ftz.f32.f16 %f1, %rs1;
-; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    cvt.ftz.f32.f16 %r1, %rs1;
+; CHECK-NEXT:    ex2.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-NEXT:    ret;
 ;
@@ -206,20 +206,19 @@ define <2 x half> @exp2_f16_test_v(<2 x half> %in) {
 ; CHECK-LABEL: exp2_f16_test_v(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b32 %r1, [exp2_f16_test_v_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NEXT:    ex2.approx.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    ex2.approx.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NEXT:    ex2.approx.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_f16_test_v(
@@ -255,22 +254,19 @@ define bfloat @exp2_bf16_test(bfloat %in) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %r1, [exp2_bf16_test_param_0];
-; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-NEXT:    mov.b32 %f1, %r2;
-; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    mov.b32 %r3, %f2;
-; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
-; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
-; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
-; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    shl.b32 %r11, %r1, 16;
+; CHECK-NEXT:    ex2.approx.f32 %r12, %r11;
+; CHECK-NEXT:    bfe.u32 %r6, %r12, 16, 1;
+; CHECK-NEXT:    add.s32 %r7, %r6, %r12;
+; CHECK-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; CHECK-NEXT:    or.b32 %r9, %r12, 4194304;
+; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
 ;
@@ -278,22 +274,19 @@ define bfloat @exp2_bf16_test(bfloat %in) {
 ; CHECK-FP16:       {
 ; CHECK-FP16-NEXT:    .reg .pred %p<2>;
 ; CHECK-FP16-NEXT:    .reg .b16 %rs<2>;
-; CHECK-FP16-NEXT:    .reg .b32 %r<9>;
-; CHECK-FP16-NEXT:    .reg .b32 %f<3>;
+; CHECK-FP16-NEXT:    .reg .b32 %r<13>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
 ; CHECK-FP16-NEXT:    ld.param.b16 %r1, [exp2_bf16_test_param_0];
-; CHECK-FP16-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FP16-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FP16-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-FP16-NEXT:    mov.b32 %r3, %f2;
-; CHECK-FP16-NEXT:    bfe.u32 %r4, %r3, 16, 1;
-; CHECK-FP16-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-FP16-NEXT:    add.s32 %r6, %r5, 32767;
-; CHECK-FP16-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; CHECK-FP16-NEXT:    or.b32 %r7, %r3, 4194304;
-; CHECK-FP16-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; CHECK-FP16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-FP16-NEXT:    shl.b32 %r11, %r1, 16;
+; CHECK-FP16-NEXT:    ex2.approx.f32 %r12, %r11;
+; CHECK-FP16-NEXT:    bfe.u32 %r6, %r12, 16, 1;
+; CHECK-FP16-NEXT:    add.s32 %r7, %r6, %r12;
+; CHECK-FP16-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-FP16-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; CHECK-FP16-NEXT:    or.b32 %r9, %r12, 4194304;
+; CHECK-FP16-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-FP16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
 ; CHECK-FP16-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-FP16-NEXT:    ret;
 ;
@@ -317,72 +310,62 @@ define <2 x bfloat> @exp2_bf16_test_v(<2 x bfloat> %in) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<27>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.u32.u16 %r2, %rs2;
-; CHECK-NEXT:    shl.b32 %r3, %r2, 16;
-; CHECK-NEXT:    mov.b32 %f1, %r3;
-; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    mov.b32 %r4, %f2;
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 1;
-; CHECK-NEXT:    add.s32 %r6, %r5, %r4;
-; CHECK-NEXT:    add.s32 %r7, %r6, 32767;
-; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; CHECK-NEXT:    or.b32 %r8, %r4, 4194304;
-; CHECK-NEXT:    selp.b32 %r9, %r8, %r7, %p1;
-; CHECK-NEXT:    cvt.u32.u16 %r10, %rs1;
-; CHECK-NEXT:    shl.b32 %r11, %r10, 16;
-; CHECK-NEXT:    mov.b32 %f3, %r11;
-; CHECK-NEXT:    ex2.approx.f32 %f4, %f3;
-; CHECK-NEXT:    mov.b32 %r12, %f4;
-; CHECK-NEXT:    bfe.u32 %r13, %r12, 16, 1;
-; CHECK-NEXT:    add.s32 %r14, %r13, %r12;
-; CHECK-NEXT:    add.s32 %r15, %r14, 32767;
-; CHECK-NEXT:    setp.nan.f32 %p2, %f4, %f4;
-; CHECK-NEXT:    or.b32 %r16, %r12, 4194304;
-; CHECK-NEXT:    selp.b32 %r17, %r16, %r15, %p2;
-; CHECK-NEXT:    prmt.b32 %r18, %r17, %r9, 0x7632U;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r18;
+; CHECK-NEXT:    shl.b32 %r23, %r2, 16;
+; CHECK-NEXT:    ex2.approx.f32 %r24, %r23;
+; CHECK-NEXT:    bfe.u32 %r7, %r24, 16, 1;
+; CHECK-NEXT:    add.s32 %r8, %r7, %r24;
+; CHECK-NEXT:    add.s32 %r9, %r8, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %r24, %r24;
+; CHECK-NEXT:    or.b32 %r10, %r24, 4194304;
+; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p1;
+; CHECK-NEXT:    cvt.u32.u16 %r12, %rs1;
+; CHECK-NEXT:    shl.b32 %r25, %r12, 16;
+; CHECK-NEXT:    ex2.approx.f32 %r26, %r25;
+; CHECK-NEXT:    bfe.u32 %r17, %r26, 16, 1;
+; CHECK-NEXT:    add.s32 %r18, %r17, %r26;
+; CHECK-NEXT:    add.s32 %r19, %r18, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p2, %r26, %r26;
+; CHECK-NEXT:    or.b32 %r20, %r26, 4194304;
+; CHECK-NEXT:    selp.b32 %r21, %r20, %r19, %p2;
+; CHECK-NEXT:    prmt.b32 %r22, %r21, %r11, 0x7632U;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r22;
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_bf16_test_v(
 ; CHECK-FP16:       {
 ; CHECK-FP16-NEXT:    .reg .pred %p<3>;
 ; CHECK-FP16-NEXT:    .reg .b16 %rs<3>;
-; CHECK-FP16-NEXT:    .reg .b32 %r<19>;
-; CHECK-FP16-NEXT:    .reg .b32 %f<5>;
+; CHECK-FP16-NEXT:    .reg .b32 %r<27>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
 ; CHECK-FP16-NEXT:    ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
 ; CHECK-FP16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-FP16-NEXT:    cvt.u32.u16 %r2, %rs2;
-; CHECK-FP16-NEXT:    shl.b32 %r3, %r2, 16;
-; CHECK-FP16-NEXT:    mov.b32 %f1, %r3;
-; CHECK-FP16-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-FP16-NEXT:    mov.b32 %r4, %f2;
-; CHECK-FP16-NEXT:    bfe.u32 %r5, %r4, 16, 1;
-; CHECK-FP16-NEXT:    add.s32 %r6, %r5, %r4;
-; CHECK-FP16-NEXT:    add.s32 %r7, %r6, 32767;
-; CHECK-FP16-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; CHECK-FP16-NEXT:    or.b32 %r8, %r4, 4194304;
-; CHECK-FP16-NEXT:    selp.b32 %r9, %r8, %r7, %p1;
-; CHECK-FP16-NEXT:    cvt.u32.u16 %r10, %rs1;
-; CHECK-FP16-NEXT:    shl.b32 %r11, %r10, 16;
-; CHECK-FP16-NEXT:    mov.b32 %f3, %r11;
-; CHECK-FP16-NEXT:    ex2.approx.f32 %f4, %f3;
-; CHECK-FP16-NEXT:    mov.b32 %r12, %f4;
-; CHECK-FP16-NEXT:    bfe.u32 %r13, %r12, 16, 1;
-; CHECK-FP16-NEXT:    add.s32 %r14, %r13, %r12;
-; CHECK-FP16-NEXT:    add.s32 %r15, %r14, 32767;
-; CHECK-FP16-NEXT:    setp.nan.f32 %p2, %f4, %f4;
-; CHECK-FP16-NEXT:    or.b32 %r16, %r12, 4194304;
-; CHECK-FP16-NEXT:    selp.b32 %r17, %r16, %r15, %p2;
-; CHECK-FP16-NEXT:    prmt.b32 %r18, %r17, %r9, 0x7632U;
-; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r18;
+; CHECK-FP16-NEXT:    shl.b32 %r23, %r2, 16;
+; CHECK-FP16-NEXT:    ex2.approx.f32 %r24, %r23;
+; CHECK-FP16-NEXT:    bfe.u32 %r7, %r24, 16, 1;
+; CHECK-FP16-NEXT:    add.s32 %r8, %r7, %r24;
+; CHECK-FP16-NEXT:    add.s32 %r9, %r8, 32767;
+; CHECK-FP16-NEXT:    setp.nan.f32 %p1, %r24, %r24;
+; CHECK-FP16-NEXT:    or.b32 %r10, %r24, 4194304;
+; CHECK-FP16-NEXT:    selp.b32 %r11, %r10, %r9, %p1;
+; CHECK-FP16-NEXT:    cvt.u32.u16 %r12, %rs1;
+; CHECK-FP16-NEXT:    shl.b32 %r25, %r12, 16;
+; CHECK-FP16-NEXT:    ex2.approx.f32 %r26, %r25;
+; CHECK-FP16-NEXT:    bfe.u32 %r17, %r26, 16, 1;
+; CHECK-FP16-NEXT:    add.s32 %r18, %r17, %r26;
+; CHECK-FP16-NEXT:    add.s32 %r19, %r18, 32767;
+; CHECK-FP16-NEXT:    setp.nan.f32 %p2, %r26, %r26;
+; CHECK-FP16-NEXT:    or.b32 %r20, %r26, 4194304;
+; CHECK-FP16-NEXT:    selp.b32 %r21, %r20, %r19, %p2;
+; CHECK-FP16-NEXT:    prmt.b32 %r22, %r21, %r11, 0x7632U;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r22;
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_bf16_test_v(
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
index d922e18edc165..c672af2893da1 100644
--- a/llvm/test/CodeGen/NVPTX/flog2.ll
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -7,12 +7,12 @@ target triple = "nvptx64-nvidia-cuda"
 define float @log2_test(float %in) {
 ; CHECK-LABEL: log2_test(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.b32 %f1, [log2_test_param_0];
-; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [log2_test_param_0];
+; CHECK-NEXT:    lg2.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call float @llvm.log2.f32(float %in)
@@ -23,12 +23,12 @@ entry:
 define float @log2_ftz_test(float %in) #0 {
 ; CHECK-LABEL: log2_ftz_test(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.b32 %f1, [log2_ftz_test_param_0];
-; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [log2_ftz_test_param_0];
+; CHECK-NEXT:    lg2.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call float @llvm.log2.f32(float %in)
@@ -39,13 +39,13 @@ entry:
 define <2 x float> @log2_test_v(<2 x float> %in) {
 ; CHECK-LABEL: log2_test_v(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [log2_test_v_param_0];
-; CHECK-NEXT:    lg2.approx.f32 %f3, %f2;
-; CHECK-NEXT:    lg2.approx.f32 %f4, %f1;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [log2_test_v_param_0];
+; CHECK-NEXT:    lg2.approx.f32 %r3, %r2;
+; CHECK-NEXT:    lg2.approx.f32 %r4, %r1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
@@ -59,13 +59,13 @@ define half @log2_f16_test(half %in) {
 ; CHECK-LABEL: log2_f16_test(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %rs1, [log2_f16_test_param_0];
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs1;
-; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT:    lg2.approx.f32 %r2, %r1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-NEXT:    ret;
 entry:
@@ -78,13 +78,13 @@ define half @log2_f16_ftz_test(half %in) #0 {
 ; CHECK-LABEL: log2_f16_ftz_test(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %rs1, [log2_f16_ftz_test_param_0];
-; CHECK-NEXT:    cvt.ftz.f32.f16 %f1, %rs1;
-; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    cvt.ftz.f32.f16 %r1, %rs1;
+; CHECK-NEXT:    lg2.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-NEXT:    ret;
 entry:
@@ -97,20 +97,19 @@ define <2 x half> @log2_f16_test_v(<2 x half> %in) {
 ; CHECK-LABEL: log2_f16_test_v(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b32 %r1, [log2_f16_test_v_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-NEXT:    lg2.approx.f32 %f4, %f3;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    lg2.approx.f32 %r3, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs1;
+; CHECK-NEXT:    lg2.approx.f32 %r5, %r4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %r5;
+; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call <2 x half> @llvm.log2.v2f16(<2 x half> %in)
@@ -125,22 +124,19 @@ define bfloat @log2_bf16_test(bfloat %in) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %r1, [log2_bf16_test_param_0];
-; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-NEXT:    mov.b32 %f1, %r2;
-; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    mov.b32 %r3, %f2;
-; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
-; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
-; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
-; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    shl.b32 %r11, %r1, 16;
+; CHECK-NEXT:    lg2.approx.f32 %r12, %r11;
+; CHECK-NEXT:    bfe.u32 %r6, %r12, 16, 1;
+; CHECK-NEXT:    add.s32 %r7, %r6, %r12;
+; CHECK-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; CHECK-NEXT:    or.b32 %r9, %r12, 4194304;
+; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
 entry:
@@ -154,22 +150,19 @@ define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b16 %r1, [log2_bf16_ftz_test_param_0];
-; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-NEXT:    mov.b32 %f1, %r2;
-; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    mov.b32 %r3, %f2;
-; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
-; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
-; CHECK-NEXT:    setp.nan.ftz.f32 %p1, %f2, %f2;
-; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
-; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    shl.b32 %r11, %r1, 16;
+; CHECK-NEXT:    lg2.approx.ftz.f32 %r12, %r11;
+; CHECK-NEXT:    bfe.u32 %r6, %r12, 16, 1;
+; CHECK-NEXT:    add.s32 %r7, %r6, %r12;
+; CHECK-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-NEXT:    setp.nan.ftz.f32 %p1, %r12, %r12;
+; CHECK-NEXT:    or.b32 %r9, %r12, 4194304;
+; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
 entry:
@@ -183,36 +176,31 @@ define <2 x bfloat> @log2_bf16_test_v(<2 x bfloat> %in) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<27>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b32 %r1, [log2_bf16_test_v_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.u32.u16 %r2, %rs2;
-; CHECK-NEXT:    shl.b32 %r3, %r2, 16;
-; CHECK-NEXT:    mov.b32 %f1, %r3;
-; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    mov.b32 %r4, %f2;
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 1;
-; CHECK-NEXT:    add.s32 %r6, %r5, %r4;
-; CHECK-NEXT:    add.s32 %r7, %r6, 32767;
-; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
-; CHECK-NEXT:    or.b32 %r8, %r4, 4194304;
-; CHECK-NEXT:    selp.b32 %r9, %r8, %r7, %p1;
-; CHECK-NEXT:    cvt.u32.u16 %r10, %rs1;
-; CHECK-NEXT:    shl.b32 %r11, %r10, 16;
-; CHECK-NEXT:    mov.b32 %f3, %r11;
-; CHECK-NEXT:    lg2.approx.f32 %f4, %f3;
-; CHECK-NEXT:    mov.b32 %r12, %f4;
-; CHECK-NEXT:    bfe.u32 %r13, %r12, 16, 1;
-; CHECK-NEXT:    add.s32 %r14, %r13, %r12;
-; CHECK-NEXT:    add.s32 %r15, %r14, 32767;
-; CHECK-NEXT:    setp.nan.f32 %p2, %f4, %f4;
-; CHECK-NEXT:    or.b32 %r16, %r12, 4194304;
-; CHECK-NEXT:    selp.b32 %r17, %r16, %r15, %p2;
-; CHECK-NEXT:    prmt.b32 %r18, %r17, %r9, 0x7632U;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r18;
+; CHECK-NEXT:    shl.b32 %r23, %r2, 16;
+; CHECK-NEXT:    lg2.approx.f32 %r24, %r23;
+; CHECK-NEXT:    bfe.u32 %r7, %r24, 16, 1;
+; CHECK-NEXT:    add.s32 %r8, %r7, %r24;
+; CHECK-NEXT:    add.s32 %r9, %r8, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %r24, %r24;
+; CHECK-NEXT:    or.b32 %r10, %r24, 4194304;
+; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p1;
+; CHECK-NEXT:    cvt.u32.u16 %r12, %rs1;
+; CHECK-NEXT:    shl.b32 %r25, %r12, 16;
+; CHECK-NEXT:    lg2.approx.f32 %r26, %r25;
+; CHECK-NEXT:    bfe.u32 %r17, %r26, 16, 1;
+; CHECK-NEXT:    add.s32 %r18, %r17, %r26;
+; CHECK-NEXT:    add.s32 %r19, %r18, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p2, %r26, %r26;
+; CHECK-NEXT:    or.b32 %r20, %r26, 4194304;
+; CHECK-NEXT:    selp.b32 %r21, %r20, %r19, %p2;
+; CHECK-NEXT:    prmt.b32 %r22, %r21, %r11, 0x7632U;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r22;
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
diff --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
index 47dc3f004a70b..1034c3eed3dc0 100644
--- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
@@ -1,13 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %}
 
 define ptx_device float @t1_f32(float %x, float %y, float %z,
+; CHECK-UNSAFE-LABEL: t1_f32(
+; CHECK-UNSAFE:       {
+; CHECK-UNSAFE-NEXT:    .reg .b32 %r<8>;
+; CHECK-UNSAFE-EMPTY:
+; CHECK-UNSAFE-NEXT:  // %bb.0:
+; CHECK-UNSAFE-NEXT:    ld.param.b32 %r1, [t1_f32_param_0];
+; CHECK-UNSAFE-NEXT:    ld.param.b32 %r2, [t1_f32_param_1];
+; CHECK-UNSAFE-NEXT:    ld.param.b32 %r3, [t1_f32_param_2];
+; CHECK-UNSAFE-NEXT:    ld.param.b32 %r4, [t1_f32_param_3];
+; CHECK-UNSAFE-NEXT:    ld.param.b32 %r5, [t1_f32_param_4];
+; CHECK-UNSAFE-NEXT:    fma.rn.f32 %r6, %r4, %r5, %r3;
+; CHECK-UNSAFE-NEXT:    fma.rn.f32 %r7, %r1, %r2, %r6;
+; CHECK-UNSAFE-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-UNSAFE-NEXT:    ret;
                                 float %u, float %v) {
-; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
   %a = fmul float %x, %y
   %b = fmul float %u, %v
   %c = fadd float %a, %b
@@ -16,10 +28,21 @@ define ptx_device float @t1_f32(float %x, float %y, float %z,
 }
 
 define ptx_device double @t1_f64(double %x, double %y, double %z,
+; CHECK-UNSAFE-LABEL: t1_f64(
+; CHECK-UNSAFE:       {
+; CHECK-UNSAFE-NEXT:    .reg .b64 %rd<8>;
+; CHECK-UNSAFE-EMPTY:
+; CHECK-UNSAFE-NEXT:  // %bb.0:
+; CHECK-UNSAFE-NEXT:    ld.param.b64 %rd1, [t1_f64_param_0];
+; CHECK-UNSAFE-NEXT:    ld.param.b64 %rd2, [t1_f64_param_1];
+; CHECK-UNSAFE-NEXT:    ld.param.b64 %rd3, [t1_f64_param_2];
+; CHECK-UNSAFE-NEXT:    ld.param.b64 %rd4, [t1_f64_param_3];
+; CHECK-UNSAFE-NEXT:    ld.param.b64 %rd5, [t1_f64_param_4];
+; CHECK-UNSAFE-NEXT:    fma.rn.f64 %rd6, %rd4, %rd5, %rd3;
+; CHECK-UNSAFE-NEXT:    fma.rn.f64 %rd7, %rd1, %rd2, %rd6;
+; CHECK-UNSAFE-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-UNSAFE-NEXT:    ret;
                                  double %u, double %v) {
-; CHECK-UNSAFE: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK-UNSAFE: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
   %a = fmul double %x, %y
   %b = fmul double %u, %v
   %c = fadd double %a, %b
@@ -29,9 +52,16 @@ define ptx_device double @t1_f64(double %x, double %y, double %z,
 
 define double @two_choices(double %val1, double %val2) {
 ; CHECK-LABEL: two_choices(
-; CHECK: mul.f64
-; CHECK-NOT: mul.f64
-; CHECK: fma.rn.f64
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [two_choices_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [two_choices_param_1];
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd3, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT:    ret;
   %1 = fmul double %val1, %val2
   %2 = fmul double %1, %1
   %3 = fadd double %1, %2
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index b971d2f237b40..c44512281f7be 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -198,16 +198,16 @@ define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 {
 ; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %r2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul half %a, %b
@@ -249,31 +249,25 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<24>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r19, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_unsafe_with_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r20, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_unsafe_with_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r21, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r22, %r21, %r20, %r19;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r22, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r22;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r22, %r22;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r22, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r23, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r23, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-SM70-NEXT:    ret;
@@ -313,31 +307,25 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<24>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r19, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r20, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r21, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r22, %r21, %r20, %r19;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r22, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r22;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r22, %r22;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r22, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r23, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r23, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-SM70-NEXT:    ret;
@@ -371,8 +359,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<9>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<7>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<6>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<15>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -382,18 +369,15 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
 ; CHECK-FTZ-NEXT:    max.bf16 %rs6, %rs4, %rs5;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs7, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs6;
-; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs7;
-; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f5, %f3, %f4;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %f5;
+; CHECK-FTZ-NEXT:    shl.b32 %r12, %r1, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r4, %r12, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs7, %r4;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs6;
+; CHECK-FTZ-NEXT:    shl.b32 %r13, %r5, 16;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r8, %rs7;
+; CHECK-FTZ-NEXT:    shl.b32 %r14, %r8, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r11, %r13, %r14;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %r11;
 ; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs8;
 ; CHECK-FTZ-NEXT:    ret;
 ;
@@ -401,54 +385,44 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<4>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<29>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<10>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<47>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r38, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r39, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r40, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r41, %r40, %r39, %r38;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r41, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r41;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r41, %r41;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r41, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r42, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r42, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
-; CHECK-SM70-NEXT:    add.f32 %f6, %f5, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p3;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    and.b32 %r22, %r19, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f8, %r22;
-; CHECK-SM70-NEXT:    add.f32 %f9, %f7, %f8;
-; CHECK-SM70-NEXT:    mov.b32 %r23, %f9;
-; CHECK-SM70-NEXT:    bfe.u32 %r24, %r23, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, %r23;
-; CHECK-SM70-NEXT:    add.s32 %r26, %r25, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f9, %f9;
-; CHECK-SM70-NEXT:    or.b32 %r27, %r23, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p4;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; }
+; CHECK-SM70-NEXT:    add.f32 %r43, %r42, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r43, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r43;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r43, %r43;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r43, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p3;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r44, %r26, 16;
+; CHECK-SM70-NEXT:    and.b32 %r45, %r25, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r46, %r44, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r33, %r46, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r34, %r33, %r46;
+; CHECK-SM70-NEXT:    add.s32 %r35, %r34, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r46, %r46;
+; CHECK-SM70-NEXT:    or.b32 %r36, %r46, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r37, %r36, %r35, %p4;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r37; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul bfloat %a, %b
@@ -489,38 +463,31 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<32>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    shl.b32 %r26, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r27, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r28, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r29, %r28, %r27, %r26;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r29, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r29;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r29, %r29;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r29, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r30, %r16, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r31, %r30, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r31, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r31;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r31, %r31;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r31, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul bfloat %a, %b
@@ -730,8 +697,7 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
 ; CHECK-SM70-LABEL: fma_f16x2_expanded_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<5>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
@@ -739,14 +705,14 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-SM70-NEXT:    max.f32 %f4, %f3, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-SM70-NEXT:    mov.b32 %r5, {%rs4, %rs3};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r5, %rs2;
+; CHECK-SM70-NEXT:    max.f32 %r6, %r5, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-SM70-NEXT:    max.f32 %r8, %r7, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %r8;
+; CHECK-SM70-NEXT:    mov.b32 %r9, {%rs4, %rs3};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x half> %a, %b
   %2 = fadd <2 x half> %1, %c
@@ -787,8 +753,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<11>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<51>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
@@ -796,53 +761,43 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r41, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r42, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r43, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r44, %r43, %r42, %r41;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r44, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r44;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r44, %r44;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r44, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r45, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r46, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r47, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r48, %r47, %r46, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r48, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r48;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r48, %r48;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r48, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r49, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r49, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r50, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r50, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
+; CHECK-SM70-NEXT:    mov.b32 %r40, {%rs10, %rs9};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r40;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x bfloat> %a, %b
   %2 = fadd <2 x bfloat> %1, %c
@@ -880,8 +835,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<11>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<51>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0];
@@ -889,53 +843,43 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r41, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r42, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r43, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r44, %r43, %r42, %r41;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r44, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r44;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r44, %r44;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r44, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r45, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r46, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r47, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r48, %r47, %r46, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r48, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r48;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r48, %r48;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r48, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r49, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r49, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r50, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r50, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
+; CHECK-SM70-NEXT:    mov.b32 %r40, {%rs10, %rs9};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r40;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x bfloat> %a, %b
   %2 = fadd <2 x bfloat> %1, %c
@@ -967,8 +911,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<20>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<11>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<36>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -979,40 +922,33 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-FTZ-NEXT:    max.bf16x2 %r6, %r4, %r5;
 ; CHECK-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r7, %rs2;
-; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r8;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs1;
-; CHECK-FTZ-NEXT:    shl.b32 %r10, %r9, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r10;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f4, %f3, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; CHECK-FTZ-NEXT:    shl.b32 %r30, %r7, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r10, %r30, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r10;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r31, %r11, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r14, %r31, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %r14;
 ; CHECK-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r6;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs5;
-; CHECK-FTZ-NEXT:    shl.b32 %r12, %r11, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f5, %r12;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r13, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r14, %r13, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f6, %r14;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f7, %f5, %f6;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r15, %rs6;
-; CHECK-FTZ-NEXT:    shl.b32 %r16, %r15, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f8, %r16;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r17, %rs3;
-; CHECK-FTZ-NEXT:    shl.b32 %r18, %r17, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f9, %r18;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f10, %f8, %f9;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r19, %f10, %f7;
-; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r19;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r15, %rs5;
+; CHECK-FTZ-NEXT:    shl.b32 %r32, %r15, 16;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r18, %rs4;
+; CHECK-FTZ-NEXT:    shl.b32 %r33, %r18, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r21, %r32, %r33;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r22, %rs6;
+; CHECK-FTZ-NEXT:    shl.b32 %r34, %r22, 16;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r25, %rs3;
+; CHECK-FTZ-NEXT:    shl.b32 %r35, %r25, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r28, %r34, %r35;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r29, %r28, %r21;
+; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r29;
 ; CHECK-FTZ-NEXT:    ret;
 ;
 ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<9>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<61>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<19>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<97>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -1020,95 +956,77 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r79, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r80, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r81, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r82, %r81, %r80, %r79;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r82, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r82;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r82, %r82;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r82, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs1;
+; CHECK-SM70-NEXT:    shl.b32 %r83, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r84, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r85, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r86, %r85, %r84, %r83;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r86, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r86;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r86, %r86;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r86, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r87, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r87, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r88, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r88, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    add.f32 %f11, %f10, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r30, %f11;
-; CHECK-SM70-NEXT:    bfe.u32 %r31, %r30, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r30;
-; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f11, %f11;
-; CHECK-SM70-NEXT:    or.b32 %r34, %r30, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p5;
-; CHECK-SM70-NEXT:    add.f32 %f12, %f9, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p6;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r42, %rs10;
-; CHECK-SM70-NEXT:    shl.b32 %r43, %r42, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f13, %r43;
-; CHECK-SM70-NEXT:    and.b32 %r44, %r41, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f14, %r44;
-; CHECK-SM70-NEXT:    add.f32 %f15, %f13, %f14;
-; CHECK-SM70-NEXT:    mov.b32 %r45, %f15;
-; CHECK-SM70-NEXT:    bfe.u32 %r46, %r45, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r47, %r46, %r45;
-; CHECK-SM70-NEXT:    add.s32 %r48, %r47, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %f15, %f15;
-; CHECK-SM70-NEXT:    or.b32 %r49, %r45, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r50, %r49, %r48, %p7;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r51, %rs9;
-; CHECK-SM70-NEXT:    shl.b32 %r52, %r51, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f16, %r52;
-; CHECK-SM70-NEXT:    and.b32 %r53, %r35, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f17, %r53;
-; CHECK-SM70-NEXT:    add.f32 %f18, %f16, %f17;
-; CHECK-SM70-NEXT:    mov.b32 %r54, %f18;
-; CHECK-SM70-NEXT:    bfe.u32 %r55, %r54, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r56, %r55, %r54;
-; CHECK-SM70-NEXT:    add.s32 %r57, %r56, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %f18, %f18;
-; CHECK-SM70-NEXT:    or.b32 %r58, %r54, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r59, %r58, %r57, %p8;
-; CHECK-SM70-NEXT:    prmt.b32 %r60, %r59, %r50, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r60;
+; CHECK-SM70-NEXT:    add.f32 %r89, %r88, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r42, %r89, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r43, %r42, %r89;
+; CHECK-SM70-NEXT:    add.s32 %r44, %r43, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %r89, %r89;
+; CHECK-SM70-NEXT:    or.b32 %r45, %r89, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r46, %r45, %r44, %p5;
+; CHECK-SM70-NEXT:    add.f32 %r90, %r87, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r90, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r90;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %r90, %r90;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r90, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p6;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r54, %rs10;
+; CHECK-SM70-NEXT:    shl.b32 %r91, %r54, 16;
+; CHECK-SM70-NEXT:    and.b32 %r92, %r53, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r93, %r91, %r92;
+; CHECK-SM70-NEXT:    bfe.u32 %r61, %r93, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r62, %r61, %r93;
+; CHECK-SM70-NEXT:    add.s32 %r63, %r62, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %r93, %r93;
+; CHECK-SM70-NEXT:    or.b32 %r64, %r93, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r65, %r64, %r63, %p7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r66, %rs9;
+; CHECK-SM70-NEXT:    shl.b32 %r94, %r66, 16;
+; CHECK-SM70-NEXT:    and.b32 %r95, %r46, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r96, %r94, %r95;
+; CHECK-SM70-NEXT:    bfe.u32 %r73, %r96, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r74, %r73, %r96;
+; CHECK-SM70-NEXT:    add.s32 %r75, %r74, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %r96, %r96;
+; CHECK-SM70-NEXT:    or.b32 %r76, %r96, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r77, %r76, %r75, %p8;
+; CHECK-SM70-NEXT:    prmt.b32 %r78, %r77, %r65, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r78;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x bfloat> %a, %b
   %2 = fadd <2 x bfloat> %1, %c
@@ -1148,8 +1066,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<67>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
@@ -1157,63 +1074,51 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r55, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r56, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    max.f32 %f10, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
-; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
-; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
-; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r57, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r58, %r57, %r56, %r55;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r58, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r58;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r58, %r58;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r58, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r59, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r60, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r61, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r62, %r61, %r60, %r59;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r62, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r62;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r62, %r62;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r62, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r63, %r35, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r64, %r63, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r40, %r64, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r41, %r40, %r64;
+; CHECK-SM70-NEXT:    add.s32 %r42, %r41, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r64, %r64;
+; CHECK-SM70-NEXT:    or.b32 %r43, %r64, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r44, %r43, %r42, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r65, %r19, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r66, %r65, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r66, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r66;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r66, %r66;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r66, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p4;
+; CHECK-SM70-NEXT:    prmt.b32 %r54, %r53, %r44, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r54;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x bfloat> %a, %b
   %2 = fadd <2 x bfloat> %1, %c
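A note on the recurring SM70 pattern in the checks above and in the next file: the bf16 emulation widens each bfloat to f32 with a plain shl.b32 by 16, computes in f32, and rounds back with the bfe/add/selp round-to-nearest-even sequence with NaN quieting. A minimal C sketch of what that sequence computes follows; the helper names are hypothetical and not part of the patch.

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical helpers, not part of the patch: a C model of the PTX
   sequence these checks repeat (shl / bfe / add / or / selp).       */

static float bf16_to_f32(uint16_t h) {
    uint32_t x = (uint32_t)h << 16;          /* shl.b32 %r, %r, 16   */
    float f;
    memcpy(&f, &x, sizeof f);
    return f;
}

static uint16_t f32_to_bf16_rne(float f) {
    uint32_t x;
    memcpy(&x, &f, sizeof x);
    uint32_t lsb = (x >> 16) & 1;              /* bfe.u32 %r, x, 16, 1     */
    uint32_t rounded = x + lsb + 32767;        /* add.s32; add.s32 +0x7FFF */
    uint32_t quieted = x | 4194304;            /* or.b32 x, 0x400000       */
    uint32_t r = isnan(f) ? quieted : rounded; /* setp.nan.f32 + selp.b32  */
    return (uint16_t)(r >> 16);                /* keep the high half       */
}

The extracted bit 16 plus the 0x7FFF bias implements round-half-to-even, and or'ing in 0x400000 sets the quiet bit when the f32 result is NaN.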
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
index d1081de000dba..ec12f3d44b5d0 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -137,16 +137,16 @@ define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) #0 {
 ; CHECK-SM70-LABEL: fma_f16_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %r2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
@@ -183,31 +183,25 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<24>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r19, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r20, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r21, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r22, %r21, %r20, %r19;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r22, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r22;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r22, %r22;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r22, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r23, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r23, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-SM70-NEXT:    ret;
@@ -238,8 +232,7 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<5>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<11>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
@@ -247,15 +240,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-FTZ-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs5, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs5;
-; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f4, %f3, %f1;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs6, %f4;
+; CHECK-FTZ-NEXT:    shl.b32 %r9, %r1, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r4, %r9, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs5, %r4;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs5;
+; CHECK-FTZ-NEXT:    shl.b32 %r10, %r5, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r8, %r10, %r9;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs6, %r8;
 ; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs6;
 ; CHECK-FTZ-NEXT:    ret;
 ;
@@ -263,48 +254,39 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<4>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<9>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    add.f32 %f6, %f5, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
-; CHECK-SM70-NEXT:    add.f32 %f8, %f7, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
+; CHECK-SM70-NEXT:    shl.b32 %r35, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r36, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r37, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r38, %r37, %r36, %r35;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r38, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r38;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r38, %r38;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r38, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r39, %r16, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r40, %r39, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r40, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r40;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r40, %r40;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r40, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r41, %r25, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r42, %r41, %r39;
+; CHECK-SM70-NEXT:    bfe.u32 %r30, %r42, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r42;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r42, %r42;
+; CHECK-SM70-NEXT:    or.b32 %r33, %r42, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r34; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -344,38 +326,31 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<32>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    shl.b32 %r26, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r27, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r28, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r29, %r28, %r27, %r26;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r29, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r29;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r29, %r29;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r29, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r30, %r16, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r31, %r30, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r31, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r31;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r31, %r31;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r31, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -515,8 +490,7 @@ define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x h
 ; CHECK-SM70-LABEL: fma_f16x2_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<5>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_maxnum_no_nans_param_2];
@@ -524,14 +498,14 @@ define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x h
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-SM70-NEXT:    max.f32 %f4, %f3, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-SM70-NEXT:    mov.b32 %r5, {%rs4, %rs3};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r5, %rs2;
+; CHECK-SM70-NEXT:    max.f32 %r6, %r5, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-SM70-NEXT:    max.f32 %r8, %r7, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %r8;
+; CHECK-SM70-NEXT:    mov.b32 %r9, {%rs4, %rs3};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   %2 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>)
@@ -567,8 +541,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<11>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<51>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_no_nans_param_0];
@@ -576,53 +549,43 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r41, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r42, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r43, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r44, %r43, %r42, %r41;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r44, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r44;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r44, %r44;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r44, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r45, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r46, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r47, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r48, %r47, %r46, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r48, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r48;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r48, %r48;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r48, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r49, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r49, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r50, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r50, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
+; CHECK-SM70-NEXT:    mov.b32 %r40, {%rs10, %rs9};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r40;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -651,8 +614,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<14>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<9>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<26>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
@@ -661,33 +623,28 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-FTZ-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
 ; CHECK-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs2;
-; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r6;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r7, %rs1;
-; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r8;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f4, %f3, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r10, %r9, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f5, %r10;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f6, %f5, %f3;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs3;
-; CHECK-FTZ-NEXT:    shl.b32 %r12, %r11, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f7, %r12;
-; CHECK-FTZ-NEXT:    add.ftz.f32 %f8, %f7, %f1;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r13, %f8, %f6;
-; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r13;
+; CHECK-FTZ-NEXT:    shl.b32 %r22, %r5, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r8, %r22, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r8;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r23, %r9, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r12, %r23, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %r12;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r13, %rs4;
+; CHECK-FTZ-NEXT:    shl.b32 %r24, %r13, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r16, %r24, %r23;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r17, %rs3;
+; CHECK-FTZ-NEXT:    shl.b32 %r25, %r17, 16;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %r20, %r25, %r22;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r21, %r20, %r16;
+; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r21;
 ; CHECK-FTZ-NEXT:    ret;
 ;
 ; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<7>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<57>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<17>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<89>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0];
@@ -695,83 +652,67 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r73, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r74, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    add.f32 %f10, %f9, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
-; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
-; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
-; CHECK-SM70-NEXT:    add.f32 %f12, %f11, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    and.b32 %r42, %r41, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f13, %r42;
-; CHECK-SM70-NEXT:    add.f32 %f14, %f13, %f11;
-; CHECK-SM70-NEXT:    mov.b32 %r43, %f14;
-; CHECK-SM70-NEXT:    bfe.u32 %r44, %r43, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r45, %r44, %r43;
-; CHECK-SM70-NEXT:    add.s32 %r46, %r45, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f14, %f14;
-; CHECK-SM70-NEXT:    or.b32 %r47, %r43, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r48, %r47, %r46, %p5;
-; CHECK-SM70-NEXT:    and.b32 %r49, %r34, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f15, %r49;
-; CHECK-SM70-NEXT:    add.f32 %f16, %f15, %f9;
-; CHECK-SM70-NEXT:    mov.b32 %r50, %f16;
-; CHECK-SM70-NEXT:    bfe.u32 %r51, %r50, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r52, %r51, %r50;
-; CHECK-SM70-NEXT:    add.s32 %r53, %r52, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %f16, %f16;
-; CHECK-SM70-NEXT:    or.b32 %r54, %r50, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r55, %r54, %r53, %p6;
-; CHECK-SM70-NEXT:    prmt.b32 %r56, %r55, %r48, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r56;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r75, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r76, %r75, %r74, %r73;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r76, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r76;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r76, %r76;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r76, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs1;
+; CHECK-SM70-NEXT:    shl.b32 %r77, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r78, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r79, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r80, %r79, %r78, %r77;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r80, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r80;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r80, %r80;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r80, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r81, %r35, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r82, %r81, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r40, %r82, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r41, %r40, %r82;
+; CHECK-SM70-NEXT:    add.s32 %r42, %r41, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r82, %r82;
+; CHECK-SM70-NEXT:    or.b32 %r43, %r82, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r44, %r43, %r42, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r83, %r19, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r84, %r83, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r84, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r84;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r84, %r84;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r84, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p4;
+; CHECK-SM70-NEXT:    and.b32 %r85, %r53, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r86, %r85, %r83;
+; CHECK-SM70-NEXT:    bfe.u32 %r58, %r86, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r59, %r58, %r86;
+; CHECK-SM70-NEXT:    add.s32 %r60, %r59, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %r86, %r86;
+; CHECK-SM70-NEXT:    or.b32 %r61, %r86, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r62, %r61, %r60, %p5;
+; CHECK-SM70-NEXT:    and.b32 %r87, %r44, -65536;
+; CHECK-SM70-NEXT:    add.f32 %r88, %r87, %r81;
+; CHECK-SM70-NEXT:    bfe.u32 %r67, %r88, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r68, %r67, %r88;
+; CHECK-SM70-NEXT:    add.s32 %r69, %r68, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %r88, %r88;
+; CHECK-SM70-NEXT:    or.b32 %r70, %r88, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r71, %r70, %r69, %p6;
+; CHECK-SM70-NEXT:    prmt.b32 %r72, %r71, %r62, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r72;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
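(Note: the six-instruction run that repeats throughout these checks, bfe.u32 /
add.s32 / add.s32 32767 / setp.nan.f32 / or.b32 4194304 / selp.b32, open-codes
the f32 -> bf16 round-to-nearest-even conversion with NaN quieting. A minimal
C++ sketch of that pattern; the function name is illustrative and appears
nowhere in the patch:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    uint16_t f32_to_bf16_rne(float f) {
      uint32_t x;
      std::memcpy(&x, &f, sizeof x);   // bit-cast; the mov.b32 %r, %f that the
                                       // old float register classes required
      uint32_t lsb = (x >> 16) & 1;    // bfe.u32 %r, %x, 16, 1
      uint32_t rne = x + lsb + 32767;  // the two add.s32 steps
      if (std::isnan(f))               // setp.nan.f32 %p, %x, %x
        rne = x | 4194304;             // or.b32: set bit 22 to quiet the NaN
      return uint16_t(rne >> 16);      // selp.b32 picks one path; the high
    }                                  // half of the register is the bf16

With f32 values now living in %r registers, the mov.b32 shuffles between %f
and %r that bracketed this sequence fold away, which accounts for most of the
line-count reduction in the hunks below.)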
@@ -810,8 +751,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<67>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_0];
@@ -819,63 +759,51 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r55, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r56, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    max.f32 %f10, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
-; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
-; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
-; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r57, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r58, %r57, %r56, %r55;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r58, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r58;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r58, %r58;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r58, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r59, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r60, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r61, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r62, %r61, %r60, %r59;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r62, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r62;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r62, %r62;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r62, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r63, %r35, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r64, %r63, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r40, %r64, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r41, %r40, %r64;
+; CHECK-SM70-NEXT:    add.s32 %r42, %r41, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r64, %r64;
+; CHECK-SM70-NEXT:    or.b32 %r43, %r64, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r44, %r43, %r42, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r65, %r19, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r66, %r65, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r66, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r66;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r66, %r66;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r66, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p4;
+; CHECK-SM70-NEXT:    prmt.b32 %r54, %r53, %r44, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r54;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
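(Note: the prmt.b32 %rD, %rA, %rB, 0x7632 that ends the bf16x2 tests packs the
two results. Selector 0x7632 picks bytes {7,6,3,2} of the byte pair {b,a},
i.e. the high halves of both sources, which is where each bf16 result sits
after rounding. A sketch, with an assumed helper name:

    #include <cstdint>

    uint32_t pack_bf16x2(uint32_t elem0, uint32_t elem1) {
      // d[15:0]  = elem0[31:16]   (low bf16 element)
      // d[31:16] = elem1[31:16]   (high bf16 element)
      return (elem0 >> 16) | (elem1 & 0xFFFF0000u);
    }
)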
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
index 05f7840dc3aa0..3d70686951fee 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
@@ -147,16 +147,16 @@ define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c)  {
 ; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %r2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast half %a, %b
@@ -194,31 +194,25 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c)  {
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<24>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r19, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r20, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r21, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r22, %r21, %r20, %r19;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r22, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r22;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r22, %r22;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r22, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r23, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r23, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-SM70-NEXT:    ret;
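(Note: the converse widening, bf16 -> f32, is the cvt.u32.u16 plus
shl.b32 ..., 16 pairs in the checks above: the bf16 bits are shifted into the
high half of a 32-bit register, which is then used directly as an f32 operand.
A sketch, assumed name:

    #include <cstdint>
    #include <cstring>

    float bf16_bits_to_f32(uint16_t b) {
      uint32_t x = uint32_t(b) << 16;  // shl.b32 %r, %r, 16
      float f;
      std::memcpy(&f, &x, sizeof f);   // previously a separate mov.b32 %f, %r;
      return f;                        // now the shl result is the operand
    }

This is where the diff shrinks: every shl used to be followed by a
mov.b32 %f, %r that is now dead.)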
@@ -252,8 +246,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<9>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<7>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<6>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<15>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -263,18 +256,15 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
 ; CHECK-FTZ-NEXT:    max.bf16 %rs6, %rs4, %rs5;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs7, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs6;
-; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs7;
-; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f5, %f3, %f4;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %f5;
+; CHECK-FTZ-NEXT:    shl.b32 %r12, %r1, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r4, %r12, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs7, %r4;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs6;
+; CHECK-FTZ-NEXT:    shl.b32 %r13, %r5, 16;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r8, %rs7;
+; CHECK-FTZ-NEXT:    shl.b32 %r14, %r8, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r11, %r13, %r14;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %r11;
 ; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs8;
 ; CHECK-FTZ-NEXT:    ret;
 ;
@@ -282,54 +272,44 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<4>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<29>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<10>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<47>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r38, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r39, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r40, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r41, %r40, %r39, %r38;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r41, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r41;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r41, %r41;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r41, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r42, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r42, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
-; CHECK-SM70-NEXT:    add.rn.f32 %f6, %f5, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p3;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    and.b32 %r22, %r19, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f8, %r22;
-; CHECK-SM70-NEXT:    add.rn.f32 %f9, %f7, %f8;
-; CHECK-SM70-NEXT:    mov.b32 %r23, %f9;
-; CHECK-SM70-NEXT:    bfe.u32 %r24, %r23, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, %r23;
-; CHECK-SM70-NEXT:    add.s32 %r26, %r25, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f9, %f9;
-; CHECK-SM70-NEXT:    or.b32 %r27, %r23, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p4;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; }
+; CHECK-SM70-NEXT:    add.rn.f32 %r43, %r42, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r43, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r43;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r43, %r43;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r43, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p3;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r44, %r26, 16;
+; CHECK-SM70-NEXT:    and.b32 %r45, %r25, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r46, %r44, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r33, %r46, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r34, %r33, %r46;
+; CHECK-SM70-NEXT:    add.s32 %r35, %r34, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r46, %r46;
+; CHECK-SM70-NEXT:    or.b32 %r36, %r46, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r37, %r36, %r35, %p4;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r37; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast bfloat %a, %b
@@ -372,38 +352,31 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<32>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    shl.b32 %r26, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_expanded_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r27, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_expanded_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r28, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r29, %r28, %r27, %r26;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r29, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r29;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r29, %r29;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r29, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r30, %r16, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r31, %r30, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r31, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r31;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r31, %r31;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r31, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast bfloat %a, %b
@@ -562,8 +535,7 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
 ; CHECK-SM70-LABEL: fma_f16x2_expanded_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<5>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
@@ -571,14 +543,14 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-SM70-NEXT:    max.f32 %f4, %f3, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-SM70-NEXT:    mov.b32 %r5, {%rs4, %rs3};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r5, %rs2;
+; CHECK-SM70-NEXT:    max.f32 %r6, %r5, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-SM70-NEXT:    max.f32 %r8, %r7, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %r8;
+; CHECK-SM70-NEXT:    mov.b32 %r9, {%rs4, %rs3};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast <2 x half> %a, %b
   %2 = fadd fast <2 x half> %1, %c
@@ -615,8 +587,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<11>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<51>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0];
@@ -624,53 +595,43 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r41, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r42, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r43, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r44, %r43, %r42, %r41;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r44, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r44;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r44, %r44;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r44, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r45, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r46, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r47, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r48, %r47, %r46, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r48, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r48;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r48, %r48;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r48, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r49, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r49, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r50, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r50, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
+; CHECK-SM70-NEXT:    mov.b32 %r40, {%rs10, %rs9};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r40;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast <2 x bfloat> %a, %b
   %2 = fadd fast <2 x bfloat> %1, %c
@@ -702,8 +663,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<20>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<11>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<36>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -714,40 +674,33 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-FTZ-NEXT:    max.bf16x2 %r6, %r4, %r5;
 ; CHECK-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r7, %rs2;
-; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r8;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs1;
-; CHECK-FTZ-NEXT:    shl.b32 %r10, %r9, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r10;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f4, %f3, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; CHECK-FTZ-NEXT:    shl.b32 %r30, %r7, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r10, %r30, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r10;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r31, %r11, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r14, %r31, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %r14;
 ; CHECK-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r6;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs5;
-; CHECK-FTZ-NEXT:    shl.b32 %r12, %r11, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f5, %r12;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r13, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r14, %r13, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f6, %r14;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f7, %f5, %f6;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r15, %rs6;
-; CHECK-FTZ-NEXT:    shl.b32 %r16, %r15, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f8, %r16;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r17, %rs3;
-; CHECK-FTZ-NEXT:    shl.b32 %r18, %r17, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f9, %r18;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f10, %f8, %f9;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r19, %f10, %f7;
-; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r19;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r15, %rs5;
+; CHECK-FTZ-NEXT:    shl.b32 %r32, %r15, 16;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r18, %rs4;
+; CHECK-FTZ-NEXT:    shl.b32 %r33, %r18, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r21, %r32, %r33;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r22, %rs6;
+; CHECK-FTZ-NEXT:    shl.b32 %r34, %r22, 16;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r25, %rs3;
+; CHECK-FTZ-NEXT:    shl.b32 %r35, %r25, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r28, %r34, %r35;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r29, %r28, %r21;
+; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r29;
 ; CHECK-FTZ-NEXT:    ret;
 ;
 ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<9>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<61>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<19>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<97>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -755,95 +708,77 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r79, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r80, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r81, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r82, %r81, %r80, %r79;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r82, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r82;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r82, %r82;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r82, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs1;
+; CHECK-SM70-NEXT:    shl.b32 %r83, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r84, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r85, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r86, %r85, %r84, %r83;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r86, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r86;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r86, %r86;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r86, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r87, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r87, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r88, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r88, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    add.rn.f32 %f11, %f10, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r30, %f11;
-; CHECK-SM70-NEXT:    bfe.u32 %r31, %r30, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r30;
-; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f11, %f11;
-; CHECK-SM70-NEXT:    or.b32 %r34, %r30, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p5;
-; CHECK-SM70-NEXT:    add.rn.f32 %f12, %f9, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p6;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r42, %rs10;
-; CHECK-SM70-NEXT:    shl.b32 %r43, %r42, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f13, %r43;
-; CHECK-SM70-NEXT:    and.b32 %r44, %r41, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f14, %r44;
-; CHECK-SM70-NEXT:    add.rn.f32 %f15, %f13, %f14;
-; CHECK-SM70-NEXT:    mov.b32 %r45, %f15;
-; CHECK-SM70-NEXT:    bfe.u32 %r46, %r45, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r47, %r46, %r45;
-; CHECK-SM70-NEXT:    add.s32 %r48, %r47, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %f15, %f15;
-; CHECK-SM70-NEXT:    or.b32 %r49, %r45, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r50, %r49, %r48, %p7;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r51, %rs9;
-; CHECK-SM70-NEXT:    shl.b32 %r52, %r51, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f16, %r52;
-; CHECK-SM70-NEXT:    and.b32 %r53, %r35, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f17, %r53;
-; CHECK-SM70-NEXT:    add.rn.f32 %f18, %f16, %f17;
-; CHECK-SM70-NEXT:    mov.b32 %r54, %f18;
-; CHECK-SM70-NEXT:    bfe.u32 %r55, %r54, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r56, %r55, %r54;
-; CHECK-SM70-NEXT:    add.s32 %r57, %r56, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %f18, %f18;
-; CHECK-SM70-NEXT:    or.b32 %r58, %r54, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r59, %r58, %r57, %p8;
-; CHECK-SM70-NEXT:    prmt.b32 %r60, %r59, %r50, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r60;
+; CHECK-SM70-NEXT:    add.rn.f32 %r89, %r88, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r42, %r89, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r43, %r42, %r89;
+; CHECK-SM70-NEXT:    add.s32 %r44, %r43, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %r89, %r89;
+; CHECK-SM70-NEXT:    or.b32 %r45, %r89, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r46, %r45, %r44, %p5;
+; CHECK-SM70-NEXT:    add.rn.f32 %r90, %r87, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r90, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r90;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %r90, %r90;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r90, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p6;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r54, %rs10;
+; CHECK-SM70-NEXT:    shl.b32 %r91, %r54, 16;
+; CHECK-SM70-NEXT:    and.b32 %r92, %r53, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r93, %r91, %r92;
+; CHECK-SM70-NEXT:    bfe.u32 %r61, %r93, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r62, %r61, %r93;
+; CHECK-SM70-NEXT:    add.s32 %r63, %r62, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %r93, %r93;
+; CHECK-SM70-NEXT:    or.b32 %r64, %r93, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r65, %r64, %r63, %p7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r66, %rs9;
+; CHECK-SM70-NEXT:    shl.b32 %r94, %r66, 16;
+; CHECK-SM70-NEXT:    and.b32 %r95, %r46, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r96, %r94, %r95;
+; CHECK-SM70-NEXT:    bfe.u32 %r73, %r96, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r74, %r73, %r96;
+; CHECK-SM70-NEXT:    add.s32 %r75, %r74, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %r96, %r96;
+; CHECK-SM70-NEXT:    or.b32 %r76, %r96, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r77, %r76, %r75, %p8;
+; CHECK-SM70-NEXT:    prmt.b32 %r78, %r77, %r65, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r78;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast <2 x bfloat> %a, %b
   %2 = fadd fast <2 x bfloat> %1, %c
@@ -883,8 +818,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<67>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
@@ -892,63 +826,51 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r55, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r56, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    max.f32 %f10, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
-; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
-; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
-; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r57, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r58, %r57, %r56, %r55;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r58, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r58;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r58, %r58;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r58, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r59, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r60, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r61, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r62, %r61, %r60, %r59;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r62, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r62;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r62, %r62;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r62, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r63, %r35, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r64, %r63, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r40, %r64, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r41, %r40, %r64;
+; CHECK-SM70-NEXT:    add.s32 %r42, %r41, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r64, %r64;
+; CHECK-SM70-NEXT:    or.b32 %r43, %r64, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r44, %r43, %r42, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r65, %r19, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r66, %r65, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r66, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r66;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r66, %r66;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r66, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p4;
+; CHECK-SM70-NEXT:    prmt.b32 %r54, %r53, %r44, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r54;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast <2 x bfloat> %a, %b
   %2 = fadd fast <2 x bfloat> %1, %c
@@ -1084,16 +1006,16 @@ define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c)  {
 ; CHECK-SM70-LABEL: fma_f16_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %r2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c)
@@ -1130,31 +1052,25 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c)  {
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<24>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    shl.b32 %r19, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r20, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r21, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r22, %r21, %r20, %r19;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r22, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r22;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r22, %r22;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r22, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
+; CHECK-SM70-NEXT:    and.b32 %r23, %r16, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %r23, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-SM70-NEXT:    ret;
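(Note: and.b32 %r, %r, -65536 (mask 0xFFFF0000) truncates an f32 to its bf16
prefix in place, so the same %r register serves as both the extracted bf16 and
the f32 operand of the following setp.gt.f32 or add.rn.f32. A sketch, assumed
name:

    #include <cstdint>
    #include <cstring>

    float truncate_to_bf16_as_f32(float f) {
      uint32_t x;
      std::memcpy(&x, &f, sizeof x);
      x &= 0xFFFF0000u;                  // and.b32 %r, %r, -65536
      float out;
      std::memcpy(&out, &x, sizeof out); // no mov.b32 %f, %r round-trip needed
      return out;
    }
)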
@@ -1185,8 +1101,7 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<5>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<11>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
@@ -1194,15 +1109,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-FTZ-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs5, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs5;
-; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f4, %f3, %f1;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs6, %f4;
+; CHECK-FTZ-NEXT:    shl.b32 %r9, %r1, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r4, %r9, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs5, %r4;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs5;
+; CHECK-FTZ-NEXT:    shl.b32 %r10, %r5, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r8, %r10, %r9;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs6, %r8;
 ; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs6;
 ; CHECK-FTZ-NEXT:    ret;
 ;
@@ -1210,48 +1123,39 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<4>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<9>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    add.rn.f32 %f6, %f5, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
-; CHECK-SM70-NEXT:    add.rn.f32 %f8, %f7, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
+; CHECK-SM70-NEXT:    shl.b32 %r35, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r36, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r37, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r38, %r37, %r36, %r35;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r38, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r38;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r38, %r38;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r38, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r39, %r16, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r40, %r39, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r40, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r40;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r40, %r40;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r40, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r41, %r25, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r42, %r41, %r39;
+; CHECK-SM70-NEXT:    bfe.u32 %r30, %r42, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r42;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r42, %r42;
+; CHECK-SM70-NEXT:    or.b32 %r33, %r42, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r34; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -1291,38 +1195,31 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)  {
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<3>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<32>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    shl.b32 %r26, %r1, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r4, [fma_bf16_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r27, %r4, 16;
+; CHECK-SM70-NEXT:    ld.param.b16 %r7, [fma_bf16_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r28, %r7, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r29, %r28, %r27, %r26;
+; CHECK-SM70-NEXT:    bfe.u32 %r12, %r29, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r13, %r12, %r29;
+; CHECK-SM70-NEXT:    add.s32 %r14, %r13, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r29, %r29;
+; CHECK-SM70-NEXT:    or.b32 %r15, %r29, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r16, %r15, %r14, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r30, %r16, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r31, %r30, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r31, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r31;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r31, %r31;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r31, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -1466,8 +1363,7 @@ define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x h
 ; CHECK-SM70-LABEL: fma_f16x2_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<6>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<5>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_maxnum_no_nans_param_2];
@@ -1475,14 +1371,14 @@ define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x h
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f3, %rs1;
-; CHECK-SM70-NEXT:    max.f32 %f4, %f3, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
-; CHECK-SM70-NEXT:    mov.b32 %r5, {%rs4, %rs3};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r5, %rs2;
+; CHECK-SM70-NEXT:    max.f32 %r6, %r5, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-SM70-NEXT:    max.f32 %r8, %r7, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %r8;
+; CHECK-SM70-NEXT:    mov.b32 %r9, {%rs4, %rs3};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   %2 = call nsz <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>)
@@ -1518,8 +1414,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<11>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<51>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_no_nans_param_0];
@@ -1527,53 +1422,43 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r41, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r42, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
-; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
-; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r43, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r44, %r43, %r42, %r41;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r44, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r44;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r44, %r44;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r44, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r45, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r46, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r47, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r48, %r47, %r46, %r45;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r48, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r48;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r48, %r48;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r48, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
+; CHECK-SM70-NEXT:    and.b32 %r49, %r19, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %r49, 0f00000000;
+; CHECK-SM70-NEXT:    and.b32 %r50, %r35, -65536;
+; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %r50, 0f00000000;
 ; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
+; CHECK-SM70-NEXT:    mov.b32 %r40, {%rs10, %rs9};
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r40;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = fcmp nsz ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -1602,8 +1487,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK-FTZ:       {
 ; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<14>;
-; CHECK-FTZ-NEXT:    .reg .b32 %f<9>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<26>;
 ; CHECK-FTZ-EMPTY:
 ; CHECK-FTZ-NEXT:  // %bb.0:
 ; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
@@ -1612,33 +1496,28 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-FTZ-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
 ; CHECK-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs2;
-; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r6;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f2, %f1, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r7, %rs1;
-; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f3, %r8;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f4, %f3, 0f40E00000;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs4;
-; CHECK-FTZ-NEXT:    shl.b32 %r10, %r9, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f5, %r10;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f5, %f3;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs3;
-; CHECK-FTZ-NEXT:    shl.b32 %r12, %r11, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f7, %r12;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f8, %f7, %f1;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r13, %f8, %f6;
-; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r13;
+; CHECK-FTZ-NEXT:    shl.b32 %r22, %r5, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r8, %r22, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %r8;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r23, %r9, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r12, %r23, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %r12;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r13, %rs4;
+; CHECK-FTZ-NEXT:    shl.b32 %r24, %r13, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r16, %r24, %r23;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r17, %rs3;
+; CHECK-FTZ-NEXT:    shl.b32 %r25, %r17, 16;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %r20, %r25, %r22;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r21, %r20, %r16;
+; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r21;
 ; CHECK-FTZ-NEXT:    ret;
 ;
 ; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<7>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<57>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<17>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<89>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0];
@@ -1646,83 +1525,67 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r73, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r74, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    add.rn.f32 %f10, %f9, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
-; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
-; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
-; CHECK-SM70-NEXT:    add.rn.f32 %f12, %f11, 0f40E00000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    and.b32 %r42, %r41, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f13, %r42;
-; CHECK-SM70-NEXT:    add.rn.f32 %f14, %f13, %f11;
-; CHECK-SM70-NEXT:    mov.b32 %r43, %f14;
-; CHECK-SM70-NEXT:    bfe.u32 %r44, %r43, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r45, %r44, %r43;
-; CHECK-SM70-NEXT:    add.s32 %r46, %r45, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f14, %f14;
-; CHECK-SM70-NEXT:    or.b32 %r47, %r43, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r48, %r47, %r46, %p5;
-; CHECK-SM70-NEXT:    and.b32 %r49, %r34, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f15, %r49;
-; CHECK-SM70-NEXT:    add.rn.f32 %f16, %f15, %f9;
-; CHECK-SM70-NEXT:    mov.b32 %r50, %f16;
-; CHECK-SM70-NEXT:    bfe.u32 %r51, %r50, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r52, %r51, %r50;
-; CHECK-SM70-NEXT:    add.s32 %r53, %r52, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %f16, %f16;
-; CHECK-SM70-NEXT:    or.b32 %r54, %r50, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r55, %r54, %r53, %p6;
-; CHECK-SM70-NEXT:    prmt.b32 %r56, %r55, %r48, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r56;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r75, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r76, %r75, %r74, %r73;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r76, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r76;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r76, %r76;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r76, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs1;
+; CHECK-SM70-NEXT:    shl.b32 %r77, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r78, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r79, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r80, %r79, %r78, %r77;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r80, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r80;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r80, %r80;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r80, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r81, %r35, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r82, %r81, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r40, %r82, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r41, %r40, %r82;
+; CHECK-SM70-NEXT:    add.s32 %r42, %r41, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r82, %r82;
+; CHECK-SM70-NEXT:    or.b32 %r43, %r82, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r44, %r43, %r42, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r83, %r19, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r84, %r83, 0f40E00000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r84, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r84;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r84, %r84;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r84, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p4;
+; CHECK-SM70-NEXT:    and.b32 %r85, %r53, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r86, %r85, %r83;
+; CHECK-SM70-NEXT:    bfe.u32 %r58, %r86, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r59, %r58, %r86;
+; CHECK-SM70-NEXT:    add.s32 %r60, %r59, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %r86, %r86;
+; CHECK-SM70-NEXT:    or.b32 %r61, %r86, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r62, %r61, %r60, %p5;
+; CHECK-SM70-NEXT:    and.b32 %r87, %r44, -65536;
+; CHECK-SM70-NEXT:    add.rn.f32 %r88, %r87, %r81;
+; CHECK-SM70-NEXT:    bfe.u32 %r67, %r88, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r68, %r67, %r88;
+; CHECK-SM70-NEXT:    add.s32 %r69, %r68, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %r88, %r88;
+; CHECK-SM70-NEXT:    or.b32 %r70, %r88, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r71, %r70, %r69, %p6;
+; CHECK-SM70-NEXT:    prmt.b32 %r72, %r71, %r62, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r72;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = fcmp nsz ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -1761,8 +1624,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
 ; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
-; CHECK-SM70-NEXT:    .reg .b32 %f<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<67>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
 ; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_0];
@@ -1770,63 +1632,51 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
+; CHECK-SM70-NEXT:    shl.b32 %r55, %r4, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
-; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r7, %rs3;
+; CHECK-SM70-NEXT:    shl.b32 %r56, %r7, 16;
 ; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
-; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
-; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
-; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
-; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
-; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
-; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
-; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
-; CHECK-SM70-NEXT:    max.f32 %f10, %f9, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
-; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
-; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
-; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
-; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
-; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
-; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
-; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
-; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r10, %rs5;
+; CHECK-SM70-NEXT:    shl.b32 %r57, %r10, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r58, %r57, %r56, %r55;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r58, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r58;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %r58, %r58;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r58, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p1;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT:    shl.b32 %r59, %r20, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r23, %rs4;
+; CHECK-SM70-NEXT:    shl.b32 %r60, %r23, 16;
+; CHECK-SM70-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECK-SM70-NEXT:    shl.b32 %r61, %r26, 16;
+; CHECK-SM70-NEXT:    fma.rn.f32 %r62, %r61, %r60, %r59;
+; CHECK-SM70-NEXT:    bfe.u32 %r31, %r62, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r62;
+; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %r62, %r62;
+; CHECK-SM70-NEXT:    or.b32 %r34, %r62, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r63, %r35, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r64, %r63, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r40, %r64, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r41, %r40, %r64;
+; CHECK-SM70-NEXT:    add.s32 %r42, %r41, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %r64, %r64;
+; CHECK-SM70-NEXT:    or.b32 %r43, %r64, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r44, %r43, %r42, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r65, %r19, -65536;
+; CHECK-SM70-NEXT:    max.f32 %r66, %r65, 0f00000000;
+; CHECK-SM70-NEXT:    bfe.u32 %r49, %r66, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r50, %r49, %r66;
+; CHECK-SM70-NEXT:    add.s32 %r51, %r50, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %r66, %r66;
+; CHECK-SM70-NEXT:    or.b32 %r52, %r66, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r53, %r52, %r51, %p4;
+; CHECK-SM70-NEXT:    prmt.b32 %r54, %r53, %r44, 0x7632U;
+; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r54;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 90fbd5ba9dfd6..327851725991e 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | %ptxas-verify %}
 
@@ -5,17 +6,50 @@ declare float @dummy_f32(float, float) #0
 declare double @dummy_f64(double, double) #0
 
 define ptx_device float @t1_f32(float %x, float %y, float %z) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
+; CHECK-LABEL: t1_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [t1_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [t1_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [t1_f32_param_2];
+; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %a = fmul float %x, %y
   %b = fadd float %a, %z
   ret float %b
 }
 
 define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
+; CHECK-LABEL: t2_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [t2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [t2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [t2_f32_param_2];
+; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [t2_f32_param_3];
+; CHECK-NEXT:    fma.rn.f32 %r6, %r1, %r2, %r5;
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .b32 param0;
+; CHECK-NEXT:    st.param.b32 [param0], %r4;
+; CHECK-NEXT:    .param .b32 param1;
+; CHECK-NEXT:    st.param.b32 [param1], %r6;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    dummy_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.b32 %r7, [retval0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
   %a = fmul float %x, %y
   %b = fadd float %a, %z
   %c = fadd float %a, %w
@@ -24,17 +58,50 @@ define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
 }
 
 define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
+; CHECK-LABEL: t1_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [t1_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [t1_f64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [t1_f64_param_2];
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT:    ret;
   %a = fmul double %x, %y
   %b = fadd double %a, %z
   ret double %b
 }
 
 define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
+; CHECK-LABEL: t2_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [t2_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [t2_f64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [t2_f64_param_2];
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT:    ld.param.b64 %rd5, [t2_f64_param_3];
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd1, %rd2, %rd5;
+; CHECK-NEXT:    { // callseq 1, 0
+; CHECK-NEXT:    .param .b64 param0;
+; CHECK-NEXT:    st.param.b64 [param0], %rd4;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1], %rd6;
+; CHECK-NEXT:    .param .b64 retval0;
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    dummy_f64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.b64 %rd7, [retval0];
+; CHECK-NEXT:    } // callseq 1
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    ret;
   %a = fmul double %x, %y
   %b = fadd double %a, %z
   %c = fadd double %a, %w
@@ -43,15 +110,28 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
 }
 
 define ptx_device float @f32_iir(float %x) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, 0f52E8D4A5, 0f4A52FC54, %f{{[0-9]+}};
-; CHECK: ret;
+; CHECK-LABEL: f32_iir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [f32_iir_param_0];
+; CHECK-NEXT:    fma.rn.f32 %r2, 0f52E8D4A5, 0f4A52FC54, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call float @llvm.fma.f32(float 499999997952.0, float 3456789.0, float %x)
   ret float %r
 }
 
 define ptx_device float @f32_iii(float %x) {
-; CHECK: mov.b32 %f{{[0-9]+}}, 0f41200000;
-; CHECK: ret;
+; CHECK-LABEL: f32_iii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 0f41200000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = call float @llvm.fma.f32(float 2.0, float 3.0, float 4.0)
   ret float %r
 }
diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll
index 89a402db8e42a..d2ba1395f4e62 100644
--- a/llvm/test/CodeGen/NVPTX/fp-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-contract.ll
@@ -15,27 +15,27 @@ target triple = "nvptx64-unknown-cuda"
 define float @t0(float %a, float %b, float %c) {
 ; FAST-LABEL: t0(
 ; FAST:       {
-; FAST-NEXT:    .reg .b32 %f<5>;
+; FAST-NEXT:    .reg .b32 %r<5>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %f1, [t0_param_0];
-; FAST-NEXT:    ld.param.b32 %f2, [t0_param_1];
-; FAST-NEXT:    ld.param.b32 %f3, [t0_param_2];
-; FAST-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
-; FAST-NEXT:    st.param.b32 [func_retval0], %f4;
+; FAST-NEXT:    ld.param.b32 %r1, [t0_param_0];
+; FAST-NEXT:    ld.param.b32 %r2, [t0_param_1];
+; FAST-NEXT:    ld.param.b32 %r3, [t0_param_2];
+; FAST-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
 ; FAST-NEXT:    ret;
 ;
 ; DEFAULT-LABEL: t0(
 ; DEFAULT:       {
-; DEFAULT-NEXT:    .reg .b32 %f<6>;
+; DEFAULT-NEXT:    .reg .b32 %r<6>;
 ; DEFAULT-EMPTY:
 ; DEFAULT-NEXT:  // %bb.0:
-; DEFAULT-NEXT:    ld.param.b32 %f1, [t0_param_0];
-; DEFAULT-NEXT:    ld.param.b32 %f2, [t0_param_1];
-; DEFAULT-NEXT:    mul.rn.f32 %f3, %f1, %f2;
-; DEFAULT-NEXT:    ld.param.b32 %f4, [t0_param_2];
-; DEFAULT-NEXT:    add.rn.f32 %f5, %f3, %f4;
-; DEFAULT-NEXT:    st.param.b32 [func_retval0], %f5;
+; DEFAULT-NEXT:    ld.param.b32 %r1, [t0_param_0];
+; DEFAULT-NEXT:    ld.param.b32 %r2, [t0_param_1];
+; DEFAULT-NEXT:    mul.rn.f32 %r3, %r1, %r2;
+; DEFAULT-NEXT:    ld.param.b32 %r4, [t0_param_2];
+; DEFAULT-NEXT:    add.rn.f32 %r5, %r3, %r4;
+; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
 ; DEFAULT-NEXT:    ret;
   %v0 = fmul float %a, %b
   %v1 = fadd float %v0, %c
@@ -47,28 +47,28 @@ define float @t0(float %a, float %b, float %c) {
 define float @t1(float %a, float %b) {
 ; FAST-LABEL: t1(
 ; FAST:       {
-; FAST-NEXT:    .reg .b32 %f<6>;
+; FAST-NEXT:    .reg .b32 %r<6>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %f1, [t1_param_0];
-; FAST-NEXT:    ld.param.b32 %f2, [t1_param_1];
-; FAST-NEXT:    add.f32 %f3, %f1, %f2;
-; FAST-NEXT:    sub.f32 %f4, %f1, %f2;
-; FAST-NEXT:    mul.f32 %f5, %f3, %f4;
-; FAST-NEXT:    st.param.b32 [func_retval0], %f5;
+; FAST-NEXT:    ld.param.b32 %r1, [t1_param_0];
+; FAST-NEXT:    ld.param.b32 %r2, [t1_param_1];
+; FAST-NEXT:    add.f32 %r3, %r1, %r2;
+; FAST-NEXT:    sub.f32 %r4, %r1, %r2;
+; FAST-NEXT:    mul.f32 %r5, %r3, %r4;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r5;
 ; FAST-NEXT:    ret;
 ;
 ; DEFAULT-LABEL: t1(
 ; DEFAULT:       {
-; DEFAULT-NEXT:    .reg .b32 %f<6>;
+; DEFAULT-NEXT:    .reg .b32 %r<6>;
 ; DEFAULT-EMPTY:
 ; DEFAULT-NEXT:  // %bb.0:
-; DEFAULT-NEXT:    ld.param.b32 %f1, [t1_param_0];
-; DEFAULT-NEXT:    ld.param.b32 %f2, [t1_param_1];
-; DEFAULT-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; DEFAULT-NEXT:    sub.rn.f32 %f4, %f1, %f2;
-; DEFAULT-NEXT:    mul.rn.f32 %f5, %f3, %f4;
-; DEFAULT-NEXT:    st.param.b32 [func_retval0], %f5;
+; DEFAULT-NEXT:    ld.param.b32 %r1, [t1_param_0];
+; DEFAULT-NEXT:    ld.param.b32 %r2, [t1_param_1];
+; DEFAULT-NEXT:    add.rn.f32 %r3, %r1, %r2;
+; DEFAULT-NEXT:    sub.rn.f32 %r4, %r1, %r2;
+; DEFAULT-NEXT:    mul.rn.f32 %r5, %r3, %r4;
+; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
 ; DEFAULT-NEXT:    ret;
   %v1 = fadd float %a, %b
   %v2 = fsub float %a, %b
@@ -81,15 +81,15 @@ define float @t1(float %a, float %b) {
 define float @t2(float %a, float %b) {
 ; CHECK-LABEL: t2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<6>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [t2_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [t2_param_1];
-; CHECK-NEXT:    add.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    sub.f32 %f4, %f1, %f2;
-; CHECK-NEXT:    mul.f32 %f5, %f3, %f4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f5;
+; CHECK-NEXT:    ld.param.b32 %r1, [t2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [t2_param_1];
+; CHECK-NEXT:    add.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    sub.f32 %r4, %r1, %r2;
+; CHECK-NEXT:    mul.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
   %v1 = fadd contract float %a, %b
   %v2 = fsub contract float %a, %b
@@ -101,14 +101,14 @@ define float @t2(float %a, float %b) {
 define float @t3(float %a, float %b, float %c) {
 ; CHECK-LABEL: t3(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [t3_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [t3_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [t3_param_2];
-; CHECK-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NEXT:    ld.param.b32 %r1, [t3_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [t3_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [t3_param_2];
+; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %v0 = fmul contract float %a, %b
   %v1 = fadd contract float %v0, %c
diff --git a/llvm/test/CodeGen/NVPTX/fp-literals.ll b/llvm/test/CodeGen/NVPTX/fp-literals.ll
index 407cce9f1c74f..e3e63ffd890f7 100644
--- a/llvm/test/CodeGen/NVPTX/fp-literals.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-literals.ll
@@ -8,14 +8,14 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; double-precision FP literals.
 
 ; CHECK: myaddf
-; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, 0f3F800000
+; CHECK: add.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, 0f3F800000
 define float @myaddf(float %a) {
   %ret = fadd float %a, 1.0
   ret float %ret
 }
 
 ; CHECK: myaddd
-; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, 0d3FF0000000000000
+; CHECK: add.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 0d3FF0000000000000
 define double @myaddd(double %a) {
   %ret = fadd double %a, 1.0
   ret double %ret
diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll
index c0658f85205e8..909f2534f8219 100644
--- a/llvm/test/CodeGen/NVPTX/frem.ll
+++ b/llvm/test/CodeGen/NVPTX/frem.ll
@@ -9,18 +9,18 @@ define half @frem_f16(half %a, half %b) {
 ; FAST-LABEL: frem_f16(
 ; FAST:       {
 ; FAST-NEXT:    .reg .b16 %rs<4>;
-; FAST-NEXT:    .reg .b32 %f<7>;
+; FAST-NEXT:    .reg .b32 %r<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
 ; FAST-NEXT:    ld.param.b16 %rs1, [frem_f16_param_0];
 ; FAST-NEXT:    ld.param.b16 %rs2, [frem_f16_param_1];
-; FAST-NEXT:    cvt.f32.f16 %f1, %rs2;
-; FAST-NEXT:    cvt.f32.f16 %f2, %rs1;
-; FAST-NEXT:    div.approx.f32 %f3, %f2, %f1;
-; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; FAST-NEXT:    neg.f32 %f5, %f4;
-; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f1, %f2;
-; FAST-NEXT:    cvt.rn.f16.f32 %rs3, %f6;
+; FAST-NEXT:    cvt.f32.f16 %r1, %rs2;
+; FAST-NEXT:    cvt.f32.f16 %r2, %rs1;
+; FAST-NEXT:    div.approx.f32 %r3, %r2, %r1;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; FAST-NEXT:    neg.f32 %r5, %r4;
+; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; FAST-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
 ; FAST-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; FAST-NEXT:    ret;
 ;
@@ -28,20 +28,20 @@ define half @frem_f16(half %a, half %b) {
 ; NORMAL:       {
 ; NORMAL-NEXT:    .reg .pred %p<2>;
 ; NORMAL-NEXT:    .reg .b16 %rs<4>;
-; NORMAL-NEXT:    .reg .b32 %f<8>;
+; NORMAL-NEXT:    .reg .b32 %r<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
 ; NORMAL-NEXT:    ld.param.b16 %rs1, [frem_f16_param_0];
 ; NORMAL-NEXT:    ld.param.b16 %rs2, [frem_f16_param_1];
-; NORMAL-NEXT:    cvt.f32.f16 %f1, %rs2;
-; NORMAL-NEXT:    cvt.f32.f16 %f2, %rs1;
-; NORMAL-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; NORMAL-NEXT:    neg.f32 %f5, %f4;
-; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f1, %f2;
-; NORMAL-NEXT:    testp.infinite.f32 %p1, %f1;
-; NORMAL-NEXT:    selp.f32 %f7, %f2, %f6, %p1;
-; NORMAL-NEXT:    cvt.rn.f16.f32 %rs3, %f7;
+; NORMAL-NEXT:    cvt.f32.f16 %r1, %rs2;
+; NORMAL-NEXT:    cvt.f32.f16 %r2, %rs1;
+; NORMAL-NEXT:    div.rn.f32 %r3, %r2, %r1;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; NORMAL-NEXT:    neg.f32 %r5, %r4;
+; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; NORMAL-NEXT:    testp.infinite.f32 %p1, %r1;
+; NORMAL-NEXT:    selp.f32 %r7, %r2, %r6, %p1;
+; NORMAL-NEXT:    cvt.rn.f16.f32 %rs3, %r7;
 ; NORMAL-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; NORMAL-NEXT:    ret;
   %r = frem half %a, %b
@@ -51,33 +51,33 @@ define half @frem_f16(half %a, half %b) {
 define float @frem_f32(float %a, float %b) {
 ; FAST-LABEL: frem_f32(
 ; FAST:       {
-; FAST-NEXT:    .reg .b32 %f<7>;
+; FAST-NEXT:    .reg .b32 %r<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_param_0];
-; FAST-NEXT:    ld.param.b32 %f2, [frem_f32_param_1];
-; FAST-NEXT:    div.approx.f32 %f3, %f1, %f2;
-; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; FAST-NEXT:    neg.f32 %f5, %f4;
-; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %f6;
+; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_param_0];
+; FAST-NEXT:    ld.param.b32 %r2, [frem_f32_param_1];
+; FAST-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; FAST-NEXT:    neg.f32 %r5, %r4;
+; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32(
 ; NORMAL:       {
 ; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b32 %f<8>;
+; NORMAL-NEXT:    .reg .b32 %r<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_param_0];
-; NORMAL-NEXT:    ld.param.b32 %f2, [frem_f32_param_1];
-; NORMAL-NEXT:    div.rn.f32 %f3, %f1, %f2;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; NORMAL-NEXT:    neg.f32 %f5, %f4;
-; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; NORMAL-NEXT:    testp.infinite.f32 %p1, %f2;
-; NORMAL-NEXT:    selp.f32 %f7, %f1, %f6, %p1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %f7;
+; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_param_0];
+; NORMAL-NEXT:    ld.param.b32 %r2, [frem_f32_param_1];
+; NORMAL-NEXT:    div.rn.f32 %r3, %r1, %r2;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; NORMAL-NEXT:    neg.f32 %r5, %r4;
+; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; NORMAL-NEXT:    testp.infinite.f32 %p1, %r2;
+; NORMAL-NEXT:    selp.f32 %r7, %r1, %r6, %p1;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %r7;
 ; NORMAL-NEXT:    ret;
   %r = frem float %a, %b
   ret float %r
@@ -86,33 +86,33 @@ define float @frem_f32(float %a, float %b) {
 define double @frem_f64(double %a, double %b) {
 ; FAST-LABEL: frem_f64(
 ; FAST:       {
-; FAST-NEXT:    .reg .b64 %fd<7>;
+; FAST-NEXT:    .reg .b64 %rd<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b64 %fd1, [frem_f64_param_0];
-; FAST-NEXT:    ld.param.b64 %fd2, [frem_f64_param_1];
-; FAST-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
-; FAST-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
-; FAST-NEXT:    neg.f64 %fd5, %fd4;
-; FAST-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; FAST-NEXT:    st.param.b64 [func_retval0], %fd6;
+; FAST-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
+; FAST-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
+; FAST-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; FAST-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; FAST-NEXT:    neg.f64 %rd5, %rd4;
+; FAST-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; FAST-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f64(
 ; NORMAL:       {
 ; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b64 %fd<8>;
+; NORMAL-NEXT:    .reg .b64 %rd<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b64 %fd1, [frem_f64_param_0];
-; NORMAL-NEXT:    ld.param.b64 %fd2, [frem_f64_param_1];
-; NORMAL-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
-; NORMAL-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
-; NORMAL-NEXT:    neg.f64 %fd5, %fd4;
-; NORMAL-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; NORMAL-NEXT:    testp.infinite.f64 %p1, %fd2;
-; NORMAL-NEXT:    selp.f64 %fd7, %fd1, %fd6, %p1;
-; NORMAL-NEXT:    st.param.b64 [func_retval0], %fd7;
+; NORMAL-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
+; NORMAL-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
+; NORMAL-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; NORMAL-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; NORMAL-NEXT:    neg.f64 %rd5, %rd4;
+; NORMAL-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; NORMAL-NEXT:    testp.infinite.f64 %p1, %rd2;
+; NORMAL-NEXT:    selp.f64 %rd7, %rd1, %rd6, %p1;
+; NORMAL-NEXT:    st.param.b64 [func_retval0], %rd7;
 ; NORMAL-NEXT:    ret;
   %r = frem double %a, %b
   ret double %r
@@ -122,36 +122,36 @@ define half @frem_f16_ninf(half %a, half %b) {
 ; FAST-LABEL: frem_f16_ninf(
 ; FAST:       {
 ; FAST-NEXT:    .reg .b16 %rs<4>;
-; FAST-NEXT:    .reg .b32 %f<7>;
+; FAST-NEXT:    .reg .b32 %r<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
 ; FAST-NEXT:    ld.param.b16 %rs1, [frem_f16_ninf_param_0];
 ; FAST-NEXT:    ld.param.b16 %rs2, [frem_f16_ninf_param_1];
-; FAST-NEXT:    cvt.f32.f16 %f1, %rs2;
-; FAST-NEXT:    cvt.f32.f16 %f2, %rs1;
-; FAST-NEXT:    div.approx.f32 %f3, %f2, %f1;
-; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; FAST-NEXT:    neg.f32 %f5, %f4;
-; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f1, %f2;
-; FAST-NEXT:    cvt.rn.f16.f32 %rs3, %f6;
+; FAST-NEXT:    cvt.f32.f16 %r1, %rs2;
+; FAST-NEXT:    cvt.f32.f16 %r2, %rs1;
+; FAST-NEXT:    div.approx.f32 %r3, %r2, %r1;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; FAST-NEXT:    neg.f32 %r5, %r4;
+; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; FAST-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
 ; FAST-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f16_ninf(
 ; NORMAL:       {
 ; NORMAL-NEXT:    .reg .b16 %rs<4>;
-; NORMAL-NEXT:    .reg .b32 %f<7>;
+; NORMAL-NEXT:    .reg .b32 %r<7>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
 ; NORMAL-NEXT:    ld.param.b16 %rs1, [frem_f16_ninf_param_0];
 ; NORMAL-NEXT:    ld.param.b16 %rs2, [frem_f16_ninf_param_1];
-; NORMAL-NEXT:    cvt.f32.f16 %f1, %rs2;
-; NORMAL-NEXT:    cvt.f32.f16 %f2, %rs1;
-; NORMAL-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; NORMAL-NEXT:    neg.f32 %f5, %f4;
-; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f1, %f2;
-; NORMAL-NEXT:    cvt.rn.f16.f32 %rs3, %f6;
+; NORMAL-NEXT:    cvt.f32.f16 %r1, %rs2;
+; NORMAL-NEXT:    cvt.f32.f16 %r2, %rs1;
+; NORMAL-NEXT:    div.rn.f32 %r3, %r2, %r1;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; NORMAL-NEXT:    neg.f32 %r5, %r4;
+; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; NORMAL-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
 ; NORMAL-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; NORMAL-NEXT:    ret;
   %r = frem ninf half %a, %b
@@ -161,30 +161,30 @@ define half @frem_f16_ninf(half %a, half %b) {
 define float @frem_f32_ninf(float %a, float %b) {
 ; FAST-LABEL: frem_f32_ninf(
 ; FAST:       {
-; FAST-NEXT:    .reg .b32 %f<7>;
+; FAST-NEXT:    .reg .b32 %r<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_ninf_param_0];
-; FAST-NEXT:    ld.param.b32 %f2, [frem_f32_ninf_param_1];
-; FAST-NEXT:    div.approx.f32 %f3, %f1, %f2;
-; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; FAST-NEXT:    neg.f32 %f5, %f4;
-; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %f6;
+; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_ninf_param_0];
+; FAST-NEXT:    ld.param.b32 %r2, [frem_f32_ninf_param_1];
+; FAST-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; FAST-NEXT:    neg.f32 %r5, %r4;
+; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32_ninf(
 ; NORMAL:       {
-; NORMAL-NEXT:    .reg .b32 %f<7>;
+; NORMAL-NEXT:    .reg .b32 %r<7>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_ninf_param_0];
-; NORMAL-NEXT:    ld.param.b32 %f2, [frem_f32_ninf_param_1];
-; NORMAL-NEXT:    div.rn.f32 %f3, %f1, %f2;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; NORMAL-NEXT:    neg.f32 %f5, %f4;
-; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %f6;
+; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_ninf_param_0];
+; NORMAL-NEXT:    ld.param.b32 %r2, [frem_f32_ninf_param_1];
+; NORMAL-NEXT:    div.rn.f32 %r3, %r1, %r2;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; NORMAL-NEXT:    neg.f32 %r5, %r4;
+; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %r6;
 ; NORMAL-NEXT:    ret;
   %r = frem ninf float %a, %b
   ret float %r
@@ -193,30 +193,30 @@ define float @frem_f32_ninf(float %a, float %b) {
 define double @frem_f64_ninf(double %a, double %b) {
 ; FAST-LABEL: frem_f64_ninf(
 ; FAST:       {
-; FAST-NEXT:    .reg .b64 %fd<7>;
+; FAST-NEXT:    .reg .b64 %rd<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b64 %fd1, [frem_f64_ninf_param_0];
-; FAST-NEXT:    ld.param.b64 %fd2, [frem_f64_ninf_param_1];
-; FAST-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
-; FAST-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
-; FAST-NEXT:    neg.f64 %fd5, %fd4;
-; FAST-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; FAST-NEXT:    st.param.b64 [func_retval0], %fd6;
+; FAST-NEXT:    ld.param.b64 %rd1, [frem_f64_ninf_param_0];
+; FAST-NEXT:    ld.param.b64 %rd2, [frem_f64_ninf_param_1];
+; FAST-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; FAST-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; FAST-NEXT:    neg.f64 %rd5, %rd4;
+; FAST-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; FAST-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f64_ninf(
 ; NORMAL:       {
-; NORMAL-NEXT:    .reg .b64 %fd<7>;
+; NORMAL-NEXT:    .reg .b64 %rd<7>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b64 %fd1, [frem_f64_ninf_param_0];
-; NORMAL-NEXT:    ld.param.b64 %fd2, [frem_f64_ninf_param_1];
-; NORMAL-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
-; NORMAL-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
-; NORMAL-NEXT:    neg.f64 %fd5, %fd4;
-; NORMAL-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; NORMAL-NEXT:    st.param.b64 [func_retval0], %fd6;
+; NORMAL-NEXT:    ld.param.b64 %rd1, [frem_f64_ninf_param_0];
+; NORMAL-NEXT:    ld.param.b64 %rd2, [frem_f64_ninf_param_1];
+; NORMAL-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; NORMAL-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; NORMAL-NEXT:    neg.f64 %rd5, %rd4;
+; NORMAL-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; NORMAL-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; NORMAL-NEXT:    ret;
   %r = frem ninf double %a, %b
   ret double %r
@@ -225,26 +225,26 @@ define double @frem_f64_ninf(double %a, double %b) {
 define float @frem_f32_imm1(float %a) {
 ; FAST-LABEL: frem_f32_imm1(
 ; FAST:       {
-; FAST-NEXT:    .reg .b32 %f<5>;
+; FAST-NEXT:    .reg .b32 %r<5>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_imm1_param_0];
-; FAST-NEXT:    mul.f32 %f2, %f1, 0f3E124925;
-; FAST-NEXT:    cvt.rzi.f32.f32 %f3, %f2;
-; FAST-NEXT:    fma.rn.f32 %f4, %f3, 0fC0E00000, %f1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %f4;
+; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_param_0];
+; FAST-NEXT:    mul.f32 %r2, %r1, 0f3E124925;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; FAST-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32_imm1(
 ; NORMAL:       {
-; NORMAL-NEXT:    .reg .b32 %f<5>;
+; NORMAL-NEXT:    .reg .b32 %r<5>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_imm1_param_0];
-; NORMAL-NEXT:    div.rn.f32 %f2, %f1, 0f40E00000;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %f3, %f2;
-; NORMAL-NEXT:    fma.rn.f32 %f4, %f3, 0fC0E00000, %f1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %f4;
+; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_param_0];
+; NORMAL-NEXT:    div.rn.f32 %r2, %r1, 0f40E00000;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; NORMAL-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %r4;
 ; NORMAL-NEXT:    ret;
   %r = frem float %a, 7.0
   ret float %r
@@ -253,33 +253,33 @@ define float @frem_f32_imm1(float %a) {
 define float @frem_f32_imm2(float %a) {
 ; FAST-LABEL: frem_f32_imm2(
 ; FAST:       {
-; FAST-NEXT:    .reg .b32 %f<7>;
+; FAST-NEXT:    .reg .b32 %r<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_imm2_param_0];
-; FAST-NEXT:    mov.b32 %f2, 0f40E00000;
-; FAST-NEXT:    div.approx.f32 %f3, %f2, %f1;
-; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; FAST-NEXT:    neg.f32 %f5, %f4;
-; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f1, 0f40E00000;
-; FAST-NEXT:    st.param.b32 [func_retval0], %f6;
+; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm2_param_0];
+; FAST-NEXT:    mov.b32 %r2, 0f40E00000;
+; FAST-NEXT:    div.approx.f32 %r3, %r2, %r1;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; FAST-NEXT:    neg.f32 %r5, %r4;
+; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r1, 0f40E00000;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32_imm2(
 ; NORMAL:       {
 ; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b32 %f<8>;
+; NORMAL-NEXT:    .reg .b32 %r<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_imm2_param_0];
-; NORMAL-NEXT:    mov.b32 %f2, 0f40E00000;
-; NORMAL-NEXT:    div.rn.f32 %f3, %f2, %f1;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; NORMAL-NEXT:    neg.f32 %f5, %f4;
-; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f1, 0f40E00000;
-; NORMAL-NEXT:    testp.infinite.f32 %p1, %f1;
-; NORMAL-NEXT:    selp.f32 %f7, 0f40E00000, %f6, %p1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %f7;
+; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm2_param_0];
+; NORMAL-NEXT:    mov.b32 %r2, 0f40E00000;
+; NORMAL-NEXT:    div.rn.f32 %r3, %r2, %r1;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; NORMAL-NEXT:    neg.f32 %r5, %r4;
+; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r1, 0f40E00000;
+; NORMAL-NEXT:    testp.infinite.f32 %p1, %r1;
+; NORMAL-NEXT:    selp.f32 %r7, 0f40E00000, %r6, %p1;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %r7;
 ; NORMAL-NEXT:    ret;
   %r = frem float 7.0, %a
   ret float %r
diff --git a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
index 169927bc0ac0f..e08e98234394f 100644
--- a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: foo
 ; CHECK: setp.ne.b16 %[[P:p[0-9]+]], %{{.*}}, 0;
 ; CHECK: selp.b32 %[[R:r[0-9]+]], 1, 0, %[[P]];
-; CHECK: cvt.rn.f32.u32 %f{{.*}}, %[[R]]
+; CHECK: cvt.rn.f32.u32 %r{{.*}}, %[[R]]
 define float @foo(i1 %a) {
   %ret = uitofp i1 %a to float
   ret float %ret
@@ -13,7 +13,7 @@ define float @foo(i1 %a) {
 ; CHECK-LABEL: foo2
 ; CHECK: setp.ne.b16 %[[P:p[0-9]+]], %{{.*}}, 0;
 ; CHECK: selp.b32 %[[R:r[0-9]+]], -1, 0, %[[P]];
-; CHECK: cvt.rn.f32.s32 %f{{.*}}, %[[R]]
+; CHECK: cvt.rn.f32.s32 %r{{.*}}, %[[R]]
 define float @foo2(i1 %a) {
   %ret = sitofp i1 %a to float
   ret float %ret
@@ -22,7 +22,7 @@ define float @foo2(i1 %a) {
 ; CHECK-LABEL: foo3
 ; CHECK: setp.ne.b16 %[[P:p[0-9]+]], %{{.*}}, 0;
 ; CHECK: selp.b32 %[[R:r[0-9]+]], 1, 0, %[[P]];
-; CHECK: cvt.rn.f64.u32 %fd{{.*}}, %[[R]]
+; CHECK: cvt.rn.f64.u32 %rd{{.*}}, %[[R]]
 define double @foo3(i1 %a) {
   %ret = uitofp i1 %a to double
   ret double %ret
@@ -31,7 +31,7 @@ define double @foo3(i1 %a) {
 ; CHECK-LABEL: foo4
 ; CHECK: setp.ne.b16 %[[P:p[0-9]+]], %{{.*}}, 0;
 ; CHECK: selp.b32 %[[R:r[0-9]+]], -1, 0, %[[P]];
-; CHECK: cvt.rn.f64.s32 %fd{{.*}}, %[[R]]
+; CHECK: cvt.rn.f64.s32 %rd{{.*}}, %[[R]]
 define double @foo4(i1 %a) {
   %ret = sitofp i1 %a to double
   ret double %ret
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 642d5d0e538a2..90847effb6d3f 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1140,13 +1140,12 @@ define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
 define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
 ; CHECK-LABEL: test_bitcast_float_to_4xi8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [test_bitcast_float_to_4xi8_param_0];
-; CHECK-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_float_to_4xi8_param_0];
+; CHECK-NEXT:    mov.b32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = bitcast float %a to <4 x i8>
   ret <4 x i8> %r
@@ -1168,13 +1167,12 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
 define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
 ; CHECK-LABEL: test_bitcast_4xi8_to_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_4xi8_to_float_param_0];
-; CHECK-NEXT:    mov.b32 %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-NEXT:    mov.b32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = bitcast <4 x i8> %a to float
   ret float %r
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm.ll b/llvm/test/CodeGen/NVPTX/inline-asm.ll
index 901714ee3076e..8630a68789e3e 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm.ll
@@ -1,16 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define float @test(float %x) {
+; CHECK-LABEL: test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b32 %r2, [test_param_0];
+; CHECK-NEXT:    // begin inline asm
+; CHECK-NEXT:    ex2.approx.ftz.f32 %r4, %r2;
+; CHECK-NEXT:    // end inline asm
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ex2.approx.ftz.f32 %f{{[0-9]+}}, %f{{[0-9]+}}
   %0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)
   ret float %0
 }
 
 define i32 @foo(i1 signext %cond, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b8 %rs1, [foo_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r2, [foo_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [foo_param_2];
+; CHECK-NEXT:    // begin inline asm
+; CHECK-NEXT:    selp.b32 %r1, %r2, %r3, %p1;
+; CHECK-NEXT:    // end inline asm
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
   %0 = tail call i32 asm "selp.b32 $0, $1, $2, $3;", "=r,r,r,b"(i32 %a, i32 %b, i1 %cond)
   ret i32 %0
 }
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index a8beeb287c225..4ed50632251cb 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -7,12 +7,12 @@
 define float @test_fabsf(float %f) {
 ; CHECK-LABEL: test_fabsf(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [test_fabsf_param_0];
-; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fabsf_param_0];
+; CHECK-NEXT:    abs.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.fabs.f32(float %f)
   ret float %x
@@ -21,12 +21,12 @@ define float @test_fabsf(float %f) {
 define double @test_fabs(double %d) {
 ; CHECK-LABEL: test_fabs(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [test_fabs_param_0];
-; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fabs_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.fabs.f64(double %d)
   ret double %x
@@ -35,12 +35,12 @@ define double @test_fabs(double %d) {
 define float @test_nvvm_sqrt(float %a) {
 ; CHECK-LABEL: test_nvvm_sqrt(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [test_nvvm_sqrt_param_0];
-; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_nvvm_sqrt_param_0];
+; CHECK-NEXT:    sqrt.rn.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.sqrt.f(float %a)
   ret float %val
@@ -49,12 +49,12 @@ define float @test_nvvm_sqrt(float %a) {
 define float @test_llvm_sqrt(float %a) {
 ; CHECK-LABEL: test_llvm_sqrt(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [test_llvm_sqrt_param_0];
-; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_llvm_sqrt_param_0];
+; CHECK-NEXT:    sqrt.rn.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.sqrt.f32(float %a)
   ret float %val
diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll
index ce922dd8a5ac9..ee304ca1601f4 100644
--- a/llvm/test/CodeGen/NVPTX/ld-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
 ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
@@ -6,60 +7,156 @@
 
 ;; i8
 define i8 @ld_global_i8(ptr addrspace(0) %ptr) {
-; PTX32: ld.b8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
-; PTX32: ret
-; PTX64: ld.b8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-; PTX64: ret
+; PTX32-LABEL: ld_global_i8(
+; PTX32:       {
+; PTX32-NEXT:    .reg .b32 %r<3>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    ld.param.b32 %r1, [ld_global_i8_param_0];
+; PTX32-NEXT:    ld.b8 %r2, [%r1];
+; PTX32-NEXT:    st.param.b32 [func_retval0], %r2;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: ld_global_i8(
+; PTX64:       {
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<2>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    ld.param.b64 %rd1, [ld_global_i8_param_0];
+; PTX64-NEXT:    ld.b8 %r1, [%rd1];
+; PTX64-NEXT:    st.param.b32 [func_retval0], %r1;
+; PTX64-NEXT:    ret;
   %a = load i8, ptr addrspace(0) %ptr
   ret i8 %a
 }
 
 ;; i16
 define i16 @ld_global_i16(ptr addrspace(0) %ptr) {
-; PTX32: ld.b16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
-; PTX32: ret
-; PTX64: ld.b16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-; PTX64: ret
+; PTX32-LABEL: ld_global_i16(
+; PTX32:       {
+; PTX32-NEXT:    .reg .b32 %r<3>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    ld.param.b32 %r1, [ld_global_i16_param_0];
+; PTX32-NEXT:    ld.b16 %r2, [%r1];
+; PTX32-NEXT:    st.param.b32 [func_retval0], %r2;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: ld_global_i16(
+; PTX64:       {
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<2>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    ld.param.b64 %rd1, [ld_global_i16_param_0];
+; PTX64-NEXT:    ld.b16 %r1, [%rd1];
+; PTX64-NEXT:    st.param.b32 [func_retval0], %r1;
+; PTX64-NEXT:    ret;
   %a = load i16, ptr addrspace(0) %ptr
   ret i16 %a
 }
 
 ;; i32
 define i32 @ld_global_i32(ptr addrspace(0) %ptr) {
-; PTX32: ld.b32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
-; PTX32: ret
-; PTX64: ld.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-; PTX64: ret
+; PTX32-LABEL: ld_global_i32(
+; PTX32:       {
+; PTX32-NEXT:    .reg .b32 %r<3>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    ld.param.b32 %r1, [ld_global_i32_param_0];
+; PTX32-NEXT:    ld.b32 %r2, [%r1];
+; PTX32-NEXT:    st.param.b32 [func_retval0], %r2;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: ld_global_i32(
+; PTX64:       {
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<2>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    ld.param.b64 %rd1, [ld_global_i32_param_0];
+; PTX64-NEXT:    ld.b32 %r1, [%rd1];
+; PTX64-NEXT:    st.param.b32 [func_retval0], %r1;
+; PTX64-NEXT:    ret;
   %a = load i32, ptr addrspace(0) %ptr
   ret i32 %a
 }
 
 ;; i64
 define i64 @ld_global_i64(ptr addrspace(0) %ptr) {
-; PTX32: ld.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
-; PTX32: ret
-; PTX64: ld.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-; PTX64: ret
+; PTX32-LABEL: ld_global_i64(
+; PTX32:       {
+; PTX32-NEXT:    .reg .b32 %r<2>;
+; PTX32-NEXT:    .reg .b64 %rd<2>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    ld.param.b32 %r1, [ld_global_i64_param_0];
+; PTX32-NEXT:    ld.b64 %rd1, [%r1];
+; PTX32-NEXT:    st.param.b64 [func_retval0], %rd1;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: ld_global_i64(
+; PTX64:       {
+; PTX64-NEXT:    .reg .b64 %rd<3>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    ld.param.b64 %rd1, [ld_global_i64_param_0];
+; PTX64-NEXT:    ld.b64 %rd2, [%rd1];
+; PTX64-NEXT:    st.param.b64 [func_retval0], %rd2;
+; PTX64-NEXT:    ret;
   %a = load i64, ptr addrspace(0) %ptr
   ret i64 %a
 }
 
 ;; f32
 define float @ld_global_f32(ptr addrspace(0) %ptr) {
-; PTX32: ld.b32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
-; PTX32: ret
-; PTX64: ld.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-; PTX64: ret
+; PTX32-LABEL: ld_global_f32(
+; PTX32:       {
+; PTX32-NEXT:    .reg .b32 %r<3>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    ld.param.b32 %r1, [ld_global_f32_param_0];
+; PTX32-NEXT:    ld.b32 %r2, [%r1];
+; PTX32-NEXT:    st.param.b32 [func_retval0], %r2;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: ld_global_f32(
+; PTX64:       {
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<2>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    ld.param.b64 %rd1, [ld_global_f32_param_0];
+; PTX64-NEXT:    ld.b32 %r1, [%rd1];
+; PTX64-NEXT:    st.param.b32 [func_retval0], %r1;
+; PTX64-NEXT:    ret;
   %a = load float, ptr addrspace(0) %ptr
   ret float %a
 }
 
 ;; f64
 define double @ld_global_f64(ptr addrspace(0) %ptr) {
-; PTX32: ld.b64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
-; PTX32: ret
-; PTX64: ld.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-; PTX64: ret
+; PTX32-LABEL: ld_global_f64(
+; PTX32:       {
+; PTX32-NEXT:    .reg .b32 %r<2>;
+; PTX32-NEXT:    .reg .b64 %rd<2>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    ld.param.b32 %r1, [ld_global_f64_param_0];
+; PTX32-NEXT:    ld.b64 %rd1, [%r1];
+; PTX32-NEXT:    st.param.b64 [func_retval0], %rd1;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: ld_global_f64(
+; PTX64:       {
+; PTX64-NEXT:    .reg .b64 %rd<3>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    ld.param.b64 %rd1, [ld_global_f64_param_0];
+; PTX64-NEXT:    ld.b64 %rd2, [%rd1];
+; PTX64-NEXT:    st.param.b64 [func_retval0], %rd2;
+; PTX64-NEXT:    ret;
   %a = load double, ptr addrspace(0) %ptr
   ret double %a
 }
diff --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
index 3f0c6b0291251..2fa4c89f4d71c 100644
--- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
+++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
@@ -42,8 +42,8 @@
     "i64": "rd",
     "half": "rs",
     "<2 x half>": "r",
-    "float": "f",
-    "double": "fd",
+    "float": "r",
+    "double": "rd",
 }
 
 addrspace_id = {
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
index 6fc698011dd42..b4a74c762f523 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
@@ -385,39 +385,39 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) {
 define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
 ; SM90-LABEL: ld_global_v8f32(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %f<16>;
+; SM90-NEXT:    .reg .b32 %r<16>;
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [ld_global_v8f32_param_0];
-; SM90-NEXT:    ld.global.nc.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1+16];
-; SM90-NEXT:    ld.global.nc.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1];
-; SM90-NEXT:    add.rn.f32 %f9, %f5, %f6;
-; SM90-NEXT:    add.rn.f32 %f10, %f7, %f8;
-; SM90-NEXT:    add.rn.f32 %f11, %f1, %f2;
-; SM90-NEXT:    add.rn.f32 %f12, %f3, %f4;
-; SM90-NEXT:    add.rn.f32 %f13, %f9, %f10;
-; SM90-NEXT:    add.rn.f32 %f14, %f11, %f12;
-; SM90-NEXT:    add.rn.f32 %f15, %f13, %f14;
-; SM90-NEXT:    st.param.b32 [func_retval0], %f15;
+; SM90-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT:    ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT:    add.rn.f32 %r9, %r5, %r6;
+; SM90-NEXT:    add.rn.f32 %r10, %r7, %r8;
+; SM90-NEXT:    add.rn.f32 %r11, %r1, %r2;
+; SM90-NEXT:    add.rn.f32 %r12, %r3, %r4;
+; SM90-NEXT:    add.rn.f32 %r13, %r9, %r10;
+; SM90-NEXT:    add.rn.f32 %r14, %r11, %r12;
+; SM90-NEXT:    add.rn.f32 %r15, %r13, %r14;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r15;
 ; SM90-NEXT:    ret;
 ;
 ; SM100-LABEL: ld_global_v8f32(
 ; SM100:       {
-; SM100-NEXT:    .reg .b32 %f<16>;
+; SM100-NEXT:    .reg .b32 %r<16>;
 ; SM100-NEXT:    .reg .b64 %rd<2>;
 ; SM100-EMPTY:
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [ld_global_v8f32_param_0];
-; SM100-NEXT:    ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
-; SM100-NEXT:    add.rn.f32 %f9, %f1, %f2;
-; SM100-NEXT:    add.rn.f32 %f10, %f3, %f4;
-; SM100-NEXT:    add.rn.f32 %f11, %f5, %f6;
-; SM100-NEXT:    add.rn.f32 %f12, %f7, %f8;
-; SM100-NEXT:    add.rn.f32 %f13, %f9, %f10;
-; SM100-NEXT:    add.rn.f32 %f14, %f11, %f12;
-; SM100-NEXT:    add.rn.f32 %f15, %f13, %f14;
-; SM100-NEXT:    st.param.b32 [func_retval0], %f15;
+; SM100-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT:    add.rn.f32 %r9, %r1, %r2;
+; SM100-NEXT:    add.rn.f32 %r10, %r3, %r4;
+; SM100-NEXT:    add.rn.f32 %r11, %r5, %r6;
+; SM100-NEXT:    add.rn.f32 %r12, %r7, %r8;
+; SM100-NEXT:    add.rn.f32 %r13, %r9, %r10;
+; SM100-NEXT:    add.rn.f32 %r14, %r11, %r12;
+; SM100-NEXT:    add.rn.f32 %r15, %r13, %r14;
+; SM100-NEXT:    st.param.b32 [func_retval0], %r15;
 ; SM100-NEXT:    ret;
   %a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <8 x float> %a, i32 0
@@ -480,31 +480,29 @@ define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) {
 define double @ld_global_v4f64(ptr addrspace(1) %ptr) {
 ; SM90-LABEL: ld_global_v4f64(
 ; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<2>;
-; SM90-NEXT:    .reg .b64 %fd<8>;
+; SM90-NEXT:    .reg .b64 %rd<9>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [ld_global_v4f64_param_0];
-; SM90-NEXT:    ld.global.nc.v2.b64 {%fd1, %fd2}, [%rd1+16];
-; SM90-NEXT:    ld.global.nc.v2.b64 {%fd3, %fd4}, [%rd1];
-; SM90-NEXT:    add.rn.f64 %fd5, %fd3, %fd4;
-; SM90-NEXT:    add.rn.f64 %fd6, %fd1, %fd2;
-; SM90-NEXT:    add.rn.f64 %fd7, %fd5, %fd6;
-; SM90-NEXT:    st.param.b64 [func_retval0], %fd7;
+; SM90-NEXT:    ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16];
+; SM90-NEXT:    ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1];
+; SM90-NEXT:    add.rn.f64 %rd6, %rd4, %rd5;
+; SM90-NEXT:    add.rn.f64 %rd7, %rd2, %rd3;
+; SM90-NEXT:    add.rn.f64 %rd8, %rd6, %rd7;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd8;
 ; SM90-NEXT:    ret;
 ;
 ; SM100-LABEL: ld_global_v4f64(
 ; SM100:       {
-; SM100-NEXT:    .reg .b64 %rd<2>;
-; SM100-NEXT:    .reg .b64 %fd<8>;
+; SM100-NEXT:    .reg .b64 %rd<9>;
 ; SM100-EMPTY:
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [ld_global_v4f64_param_0];
-; SM100-NEXT:    ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
-; SM100-NEXT:    add.rn.f64 %fd5, %fd1, %fd2;
-; SM100-NEXT:    add.rn.f64 %fd6, %fd3, %fd4;
-; SM100-NEXT:    add.rn.f64 %fd7, %fd5, %fd6;
-; SM100-NEXT:    st.param.b64 [func_retval0], %fd7;
+; SM100-NEXT:    ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT:    add.rn.f64 %rd6, %rd2, %rd3;
+; SM100-NEXT:    add.rn.f64 %rd7, %rd4, %rd5;
+; SM100-NEXT:    add.rn.f64 %rd8, %rd6, %rd7;
+; SM100-NEXT:    st.param.b64 [func_retval0], %rd8;
 ; SM100-NEXT:    ret;
   %a = load <4 x double>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <4 x double> %a, i32 0
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 3a342e4d838c6..c5c5de4c1b85e 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -26,18 +26,17 @@ define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: ld_global_v2f16(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v2f16_param_0];
 ; CHECK-NEXT:    ld.global.nc.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs1;
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r4;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-NEXT:    ret;
   %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
@@ -54,24 +53,24 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: ld_global_v4f16(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<8>;
-; CHECK-NEXT:    .reg .b32 %f<10>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v4f16_param_0];
 ; CHECK-NEXT:    ld.global.nc.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NEXT:    cvt.f32.f16 %f4, %rs4;
-; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
-; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
-; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NEXT:    cvt.f32.f16 %r5, %rs3;
+; CHECK-NEXT:    add.rn.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs6;
+; CHECK-NEXT:    cvt.f32.f16 %r8, %rs5;
+; CHECK-NEXT:    add.rn.f32 %r9, %r8, %r7;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %r9;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-NEXT:    ret;
   %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
@@ -91,8 +90,7 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: ld_global_v8f16(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<8>;
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b32 %f<10>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -102,18 +100,18 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
-; CHECK-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-NEXT:    cvt.f32.f16 %f2, %rs3;
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NEXT:    cvt.f32.f16 %f4, %rs2;
-; CHECK-NEXT:    cvt.f32.f16 %f5, %rs1;
-; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
-; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
-; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
-; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
+; CHECK-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs3;
+; CHECK-NEXT:    add.rn.f32 %r7, %r6, %r5;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r7;
+; CHECK-NEXT:    cvt.f32.f16 %r8, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r9, %rs1;
+; CHECK-NEXT:    add.rn.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r10;
+; CHECK-NEXT:    cvt.f32.f16 %r11, %rs6;
+; CHECK-NEXT:    cvt.f32.f16 %r12, %rs5;
+; CHECK-NEXT:    add.rn.f32 %r13, %r12, %r11;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %r13;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-NEXT:    ret;
   %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index 0a528f0e8da06..419c780f7d82a 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -1,13 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 declare <4 x float> @bar()
 
-; CHECK-LABEL: .func foo(
 define void @foo(ptr %ptr) {
-; CHECK:     ld.param.b64 %[[PTR:rd[0-9]+]], [foo_param_0];
-; CHECK:     ld.param.v4.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0];
-; CHECK:     st.v4.b32    [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 16 .b8 retval0[16];
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    bar,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    ret;
   %val = tail call <4 x float> @bar()
   store <4 x float> %val, ptr %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index be2e896f57009..7ac697c4ce203 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -104,13 +104,13 @@ define ptr @test_ldu_p(ptr addrspace(1) %ptr) {
 define float @test_ldu_f32(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldu_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_f32_param_0];
-; CHECK-NEXT:    ldu.global.b32 %f1, [%rd1];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-NEXT:    ldu.global.b32 %r1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
@@ -119,13 +119,12 @@ define float @test_ldu_f32(ptr addrspace(1) %ptr) {
 define double @test_ldu_f64(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldu_f64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_f64_param_0];
-; CHECK-NEXT:    ldu.global.b64 %fd1, [%rd1];
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
+; CHECK-NEXT:    ldu.global.b64 %rd2, [%rd1];
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
@@ -241,13 +240,13 @@ define ptr @test_ldg_p(ptr addrspace(1) %ptr) {
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_f32_param_0];
-; CHECK-NEXT:    ld.global.nc.b32 %f1, [%rd1];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-NEXT:    ld.global.nc.b32 %r1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
@@ -256,13 +255,12 @@ define float @test_ldg_f32(ptr addrspace(1) %ptr) {
 define double @test_ldg_f64(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_f64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_f64_param_0];
-; CHECK-NEXT:    ld.global.nc.b64 %fd1, [%rd1];
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
+; CHECK-NEXT:    ld.global.nc.b64 %rd2, [%rd1];
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
index 5d974cef0d475..a9bd3c1caebe5 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
@@ -110,11 +110,11 @@ define void @avar_i64() {
 define void @avar_float() {
 ; PTX-LABEL: avar_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin];
-; PTX-NEXT:    st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
+; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0
   store <8 x float> %load, ptr addrspace(1) @globalout
@@ -124,11 +124,11 @@ define void @avar_float() {
 define void @avar_double() {
 ; PTX-LABEL: avar_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin];
-; PTX-NEXT:    st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
+; PTX-NEXT:    st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
 ; PTX-NEXT:    ret;
   %load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0
   store <4 x double> %load, ptr addrspace(1) @globalout
@@ -234,11 +234,11 @@ define void @asi_i64() {
 define void @asi_float() {
 ; PTX-LABEL: asi_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32];
-; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
+; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
   %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
@@ -250,11 +250,11 @@ define void @asi_float() {
 define void @asi_double() {
 ; PTX-LABEL: asi_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32];
-; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
+; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
   %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
@@ -364,14 +364,14 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: areg_64_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_float_param_0];
-; PTX-NEXT:    ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
+; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_float_param_1];
-; PTX-NEXT:    st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0
   store <8 x float> %load, ptr addrspace(1) %out
@@ -381,14 +381,13 @@ define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: areg_64_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %rd<3>;
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_double_param_0];
-; PTX-NEXT:    ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
-; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_double_param_1];
-; PTX-NEXT:    st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; PTX-NEXT:    ld.param.b64 %rd6, [areg_64_double_param_1];
+; PTX-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
 ; PTX-NEXT:    ret;
   %load = load <4 x double>, ptr addrspace(1) %in, !invariant.load !0
   store <4 x double> %load, ptr addrspace(1) %out
@@ -511,14 +510,14 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: ari_64_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_float_param_0];
 ; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_float_param_1];
-; PTX-NEXT:    ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32];
-; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
   %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
@@ -530,14 +529,13 @@ define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: ari_64_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %rd<3>;
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_double_param_0];
 ; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_double_param_1];
-; PTX-NEXT:    ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32];
-; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
+; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
   %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
index 0e61478520abb..45e17016d8ee8 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
@@ -106,11 +106,11 @@ define void @avar_i64() {
 define void @avar_float() {
 ; PTX-LABEL: avar_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin];
-; PTX-NEXT:    st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
+; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %load = load <8 x float>, ptr addrspace(1) @globalin
   store <8 x float> %load, ptr addrspace(1) @globalout
@@ -120,11 +120,11 @@ define void @avar_float() {
 define void @avar_double() {
 ; PTX-LABEL: avar_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin];
-; PTX-NEXT:    st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
+; PTX-NEXT:    st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
 ; PTX-NEXT:    ret;
   %load = load <4 x double>, ptr addrspace(1) @globalin
   store <4 x double> %load, ptr addrspace(1) @globalout
@@ -230,11 +230,11 @@ define void @asi_i64() {
 define void @asi_float() {
 ; PTX-LABEL: asi_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32];
-; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
+; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
   %load = load <8 x float>, ptr addrspace(1) %in.offset
@@ -246,11 +246,11 @@ define void @asi_float() {
 define void @asi_double() {
 ; PTX-LABEL: asi_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32];
-; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
+; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
   %load = load <4 x double>, ptr addrspace(1) %in.offset
@@ -360,14 +360,14 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: areg_64_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_float_param_0];
-; PTX-NEXT:    ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
+; PTX-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_float_param_1];
-; PTX-NEXT:    st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %load = load <8 x float>, ptr addrspace(1) %in
   store <8 x float> %load, ptr addrspace(1) %out
@@ -377,14 +377,13 @@ define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: areg_64_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %rd<3>;
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_double_param_0];
-; PTX-NEXT:    ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
-; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_double_param_1];
-; PTX-NEXT:    st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; PTX-NEXT:    ld.param.b64 %rd6, [areg_64_double_param_1];
+; PTX-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
 ; PTX-NEXT:    ret;
   %load = load <4 x double>, ptr addrspace(1) %in
   store <4 x double> %load, ptr addrspace(1) %out
@@ -507,14 +506,14 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: ari_64_float(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %f<9>;
+; PTX-NEXT:    .reg .b32 %r<9>;
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_float_param_0];
 ; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_float_param_1];
-; PTX-NEXT:    ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32];
-; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; PTX-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
   %load = load <8 x float>, ptr addrspace(1) %in.offset
@@ -526,14 +525,13 @@ define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-LABEL: ari_64_double(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %rd<3>;
-; PTX-NEXT:    .reg .b64 %fd<5>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_double_param_0];
 ; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_double_param_1];
-; PTX-NEXT:    ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32];
-; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4};
+; PTX-NEXT:    ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
+; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
 ; PTX-NEXT:    ret;
   %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
   %load = load <4 x double>, ptr addrspace(1) %in.offset
diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
index 468e19492bfd5..bac59be5158ea 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
@@ -91,14 +91,14 @@ define void @generic_i64(ptr %a) {
 define void @generic_float(ptr %a) {
 ; CHECK-LABEL: generic_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_float_param_0];
-; CHECK-NEXT:    ld.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr %a
   %a.add = fadd float %a.load, 1.
@@ -109,14 +109,13 @@ define void @generic_float(ptr %a) {
 define void @generic_double(ptr %a) {
 ; CHECK-LABEL: generic_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_double_param_0];
-; CHECK-NEXT:    ld.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr %a
   %a.add = fadd double %a.load, 1.
@@ -200,14 +199,14 @@ define void @generic_volatile_i64(ptr %a) {
 define void @generic_volatile_float(ptr %a) {
 ; CHECK-LABEL: generic_volatile_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_float_param_0];
-; CHECK-NEXT:    ld.volatile.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr %a
   %a.add = fadd float %a.load, 1.
@@ -218,14 +217,13 @@ define void @generic_volatile_float(ptr %a) {
 define void @generic_volatile_double(ptr %a) {
 ; CHECK-LABEL: generic_volatile_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_double_param_0];
-; CHECK-NEXT:    ld.volatile.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr %a
   %a.add = fadd double %a.load, 1.
@@ -356,26 +354,26 @@ define void @generic_unordered_sys_i64(ptr %a) {
 define void @generic_unordered_sys_float(ptr %a) {
 ; SM60-LABEL: generic_unordered_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.relaxed.sys.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -386,26 +384,24 @@ define void @generic_unordered_sys_float(ptr %a) {
 define void @generic_unordered_sys_double(ptr %a) {
 ; SM60-LABEL: generic_unordered_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.relaxed.sys.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -489,14 +485,14 @@ define void @generic_unordered_volatile_sys_i64(ptr %a) {
 define void @generic_unordered_volatile_sys_float(ptr %a) {
 ; CHECK-LABEL: generic_unordered_volatile_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -507,14 +503,13 @@ define void @generic_unordered_volatile_sys_float(ptr %a) {
 define void @generic_unordered_volatile_sys_double(ptr %a) {
 ; CHECK-LABEL: generic_unordered_volatile_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -645,26 +640,26 @@ define void @generic_monotonic_sys_i64(ptr %a) {
 define void @generic_monotonic_sys_float(ptr %a) {
 ; SM60-LABEL: generic_monotonic_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.relaxed.sys.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -675,26 +670,24 @@ define void @generic_monotonic_sys_float(ptr %a) {
 define void @generic_monotonic_sys_double(ptr %a) {
 ; SM60-LABEL: generic_monotonic_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.relaxed.sys.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -778,14 +771,14 @@ define void @generic_monotonic_volatile_sys_i64(ptr %a) {
 define void @generic_monotonic_volatile_sys_float(ptr %a) {
 ; CHECK-LABEL: generic_monotonic_volatile_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -796,14 +789,13 @@ define void @generic_monotonic_volatile_sys_float(ptr %a) {
 define void @generic_monotonic_volatile_sys_double(ptr %a) {
 ; CHECK-LABEL: generic_monotonic_volatile_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -889,14 +881,14 @@ define void @global_i64(ptr addrspace(1) %a) {
 define void @global_float(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_float_param_0];
-; CHECK-NEXT:    ld.global.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.global.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr addrspace(1) %a
   %a.add = fadd float %a.load, 1.
@@ -907,14 +899,13 @@ define void @global_float(ptr addrspace(1) %a) {
 define void @global_double(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_double_param_0];
-; CHECK-NEXT:    ld.global.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.global.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.global.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.global.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr addrspace(1) %a
   %a.add = fadd double %a.load, 1.
@@ -998,14 +989,14 @@ define void @global_volatile_i64(ptr addrspace(1) %a) {
 define void @global_volatile_float(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_volatile_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_float_param_0];
-; CHECK-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr addrspace(1) %a
   %a.add = fadd float %a.load, 1.
@@ -1016,14 +1007,13 @@ define void @global_volatile_float(ptr addrspace(1) %a) {
 define void @global_volatile_double(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_volatile_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_double_param_0];
-; CHECK-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr addrspace(1) %a
   %a.add = fadd double %a.load, 1.
@@ -1154,26 +1144,26 @@ define void @global_unordered_sys_i64(ptr addrspace(1) %a) {
 define void @global_unordered_sys_float(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_unordered_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.relaxed.sys.global.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -1184,26 +1174,24 @@ define void @global_unordered_sys_float(ptr addrspace(1) %a) {
 define void @global_unordered_sys_double(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_unordered_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.relaxed.sys.global.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -1334,26 +1322,26 @@ define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) {
 define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_unordered_volatile_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_float_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -1364,26 +1352,24 @@ define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
 define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_unordered_volatile_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_double_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -1514,26 +1500,26 @@ define void @global_monotonic_sys_i64(ptr addrspace(1) %a) {
 define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_monotonic_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.relaxed.sys.global.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -1544,26 +1530,24 @@ define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
 define void @global_monotonic_sys_double(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_monotonic_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.relaxed.sys.global.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -1694,26 +1678,26 @@ define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) {
 define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_monotonic_volatile_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_float_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -1724,26 +1708,24 @@ define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
 define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) {
 ; SM60-LABEL: global_monotonic_volatile_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_double_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -1829,14 +1811,14 @@ define void @shared_i64(ptr addrspace(3) %a) {
 define void @shared_float(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_float_param_0];
-; CHECK-NEXT:    ld.shared.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.shared.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.shared.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr addrspace(3) %a
   %a.add = fadd float %a.load, 1.
@@ -1847,14 +1829,13 @@ define void @shared_float(ptr addrspace(3) %a) {
 define void @shared_double(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_double_param_0];
-; CHECK-NEXT:    ld.shared.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.shared.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.shared.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr addrspace(3) %a
   %a.add = fadd double %a.load, 1.
@@ -1938,14 +1919,14 @@ define void @shared_volatile_i64(ptr addrspace(3) %a) {
 define void @shared_volatile_float(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_volatile_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_float_param_0];
-; CHECK-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr addrspace(3) %a
   %a.add = fadd float %a.load, 1.
@@ -1956,14 +1937,13 @@ define void @shared_volatile_float(ptr addrspace(3) %a) {
 define void @shared_volatile_double(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_volatile_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_double_param_0];
-; CHECK-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr addrspace(3) %a
   %a.add = fadd double %a.load, 1.
@@ -2094,26 +2074,26 @@ define void @shared_unordered_sys_i64(ptr addrspace(3) %a) {
 define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
 ; SM60-LABEL: shared_unordered_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.relaxed.sys.shared.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2124,26 +2104,24 @@ define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
 define void @shared_unordered_sys_double(ptr addrspace(3) %a) {
 ; SM60-LABEL: shared_unordered_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.relaxed.sys.shared.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2227,14 +2205,14 @@ define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) {
 define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_unordered_volatile_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2245,14 +2223,13 @@ define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) {
 define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_unordered_volatile_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2383,26 +2360,26 @@ define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) {
 define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
 ; SM60-LABEL: shared_monotonic_sys_float(
 ; SM60:       {
-; SM60-NEXT:    .reg .b32 %f<3>;
+; SM60-NEXT:    .reg .b32 %r<3>;
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
-; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
+; SM60-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
+; SM60-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_float(
 ; SM70:       {
-; SM70-NEXT:    .reg .b32 %f<3>;
+; SM70-NEXT:    .reg .b32 %r<3>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.b32 %f1, [%rd1];
-; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %f2;
+; SM70-NEXT:    ld.relaxed.sys.shared.b32 %r1, [%rd1];
+; SM70-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -2413,26 +2390,24 @@ define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
 define void @shared_monotonic_sys_double(ptr addrspace(3) %a) {
 ; SM60-LABEL: shared_monotonic_sys_double(
 ; SM60:       {
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-NEXT:    .reg .b64 %fd<3>;
+; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
 ; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
-; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
+; SM60-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
+; SM60-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_double(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-NEXT:    .reg .b64 %fd<3>;
+; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.b64 %fd1, [%rd1];
-; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %fd2;
+; SM70-NEXT:    ld.relaxed.sys.shared.b64 %rd2, [%rd1];
+; SM70-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -2516,14 +2491,14 @@ define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) {
 define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_monotonic_volatile_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -2534,14 +2509,13 @@ define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) {
 define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_monotonic_volatile_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -2627,14 +2601,14 @@ define void @local_i64(ptr addrspace(5) %a) {
 define void @local_float(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_float_param_0];
-; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr addrspace(5) %a
   %a.add = fadd float %a.load, 1.
@@ -2645,14 +2619,13 @@ define void @local_float(ptr addrspace(5) %a) {
 define void @local_double(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_double_param_0];
-; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr addrspace(5) %a
   %a.add = fadd double %a.load, 1.
@@ -2736,14 +2709,14 @@ define void @local_volatile_i64(ptr addrspace(5) %a) {
 define void @local_volatile_float(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_volatile_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_float_param_0];
-; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr addrspace(5) %a
   %a.add = fadd float %a.load, 1.
@@ -2754,14 +2727,13 @@ define void @local_volatile_float(ptr addrspace(5) %a) {
 define void @local_volatile_double(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_volatile_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_double_param_0];
-; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr addrspace(5) %a
   %a.add = fadd double %a.load, 1.
@@ -2845,14 +2817,14 @@ define void @local_unordered_sys_i64(ptr addrspace(5) %a) {
 define void @local_unordered_sys_float(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_unordered_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_float_param_0];
-; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2863,14 +2835,13 @@ define void @local_unordered_sys_float(ptr addrspace(5) %a) {
 define void @local_unordered_sys_double(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_unordered_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_double_param_0];
-; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2954,14 +2925,14 @@ define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) {
 define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_unordered_volatile_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2972,14 +2943,13 @@ define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) {
 define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_unordered_volatile_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -3063,14 +3033,14 @@ define void @local_monotonic_sys_i64(ptr addrspace(5) %a) {
 define void @local_monotonic_sys_float(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_monotonic_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_float_param_0];
-; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -3081,14 +3051,13 @@ define void @local_monotonic_sys_float(ptr addrspace(5) %a) {
 define void @local_monotonic_sys_double(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_monotonic_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_double_param_0];
-; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -3172,14 +3141,14 @@ define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) {
 define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_monotonic_volatile_sys_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r2, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -3190,14 +3159,13 @@ define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) {
 define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_monotonic_volatile_sys_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index f967fd1381be5..2ffefd0cf461d 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
 ; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
 
@@ -34,40 +35,59 @@
 
 ; CHECK-LABEL: generic_unordered_gpu
 define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_gpu_param_0];
+; CHECK-NEXT:    ld.relaxed.gpu.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_unordered_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_unordered_gpu_param_2];
+; CHECK-NEXT:    st.relaxed.gpu.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_unordered_gpu_param_3];
+; CHECK-NEXT:    ld.relaxed.gpu.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_unordered_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.gpu.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.gpu.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.gpu.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.gpu.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.gpu.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.gpu.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.gpu.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.gpu.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.gpu.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.relaxed.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.relaxed.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.relaxed.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") unordered, align 8
 
   ret void
@@ -75,40 +95,59 @@ define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_unordered_volatile_gpu
 define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_unordered_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_unordered_volatile_gpu_param_2];
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_unordered_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.volatile.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_unordered_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8
 
   ret void
@@ -116,40 +155,59 @@ define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_unordered_cta
 define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_cta_param_0];
+; CHECK-NEXT:    ld.relaxed.cta.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_unordered_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_unordered_cta_param_2];
+; CHECK-NEXT:    st.relaxed.cta.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_unordered_cta_param_3];
+; CHECK-NEXT:    ld.relaxed.cta.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_unordered_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cta.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cta.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cta.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cta.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cta.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cta.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cta.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cta.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cta.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.relaxed.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.relaxed.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.relaxed.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") unordered, align 8
 
   ret void
@@ -157,40 +215,59 @@ define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_unordered_volatile_cta
 define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_cta_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_unordered_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_unordered_volatile_cta_param_2];
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_unordered_volatile_cta_param_3];
+; CHECK-NEXT:    ld.volatile.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_unordered_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8
 
   ret void
@@ -198,40 +275,59 @@ define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_monotonic_gpu
 define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_gpu_param_0];
+; CHECK-NEXT:    ld.relaxed.gpu.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_monotonic_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_monotonic_gpu_param_2];
+; CHECK-NEXT:    st.relaxed.gpu.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_monotonic_gpu_param_3];
+; CHECK-NEXT:    ld.relaxed.gpu.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_monotonic_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.gpu.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.gpu.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.gpu.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.gpu.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.gpu.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.gpu.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.gpu.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.gpu.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.gpu.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.relaxed.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.relaxed.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.relaxed.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8
 
   ret void
@@ -239,40 +335,59 @@ define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_monotonic_volatile_gpu
 define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_monotonic_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_monotonic_volatile_gpu_param_2];
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_monotonic_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.volatile.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_monotonic_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8
 
   ret void
@@ -280,40 +395,59 @@ define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_monotonic_cta
 define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_cta_param_0];
+; CHECK-NEXT:    ld.relaxed.cta.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_monotonic_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_monotonic_cta_param_2];
+; CHECK-NEXT:    st.relaxed.cta.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_monotonic_cta_param_3];
+; CHECK-NEXT:    ld.relaxed.cta.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_monotonic_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cta.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cta.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cta.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cta.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cta.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cta.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cta.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cta.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cta.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8
 
   ret void
@@ -321,40 +455,59 @@ define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_monotonic_volatile_cta
 define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_cta_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_monotonic_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_monotonic_volatile_cta_param_2];
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_monotonic_volatile_cta_param_3];
+; CHECK-NEXT:    ld.volatile.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_monotonic_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8
 
   ret void
@@ -362,40 +515,59 @@ define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_acq_rel_sys
 define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_sys_param_0];
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_sys_param_2];
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_sys_param_3];
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a release, align 1
 
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b release, align 2
 
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d release, align 8
 
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e release, align 8
 
   ret void
@@ -403,40 +575,59 @@ define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
 
 ; CHECK-LABEL: generic_acq_rel_volatile_sys
 define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_volatile_sys_param_0];
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_volatile_sys_param_2];
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_volatile_sys_param_3];
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a release, align 1
 
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b release, align 2
 
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d release, align 8
 
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e release, align 8
 
   ret void
@@ -444,40 +635,59 @@ define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e
 
 ; CHECK-LABEL: generic_acq_rel_gpu
 define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_gpu_param_0];
+; CHECK-NEXT:    ld.acquire.gpu.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_gpu_param_2];
+; CHECK-NEXT:    st.release.gpu.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_gpu_param_3];
+; CHECK-NEXT:    ld.acquire.gpu.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.gpu.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.gpu.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.gpu.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.gpu.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.gpu.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.gpu.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.gpu.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.gpu.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.gpu.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") release, align 8
 
   ret void
@@ -485,40 +695,59 @@ define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
 
 ; CHECK-LABEL: generic_acq_rel_volatile_gpu
 define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_volatile_gpu_param_2];
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8
 
   ret void
@@ -526,40 +755,59 @@ define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e
 
 ; CHECK-LABEL: generic_acq_rel_cta
 define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_cta_param_0];
+; CHECK-NEXT:    ld.acquire.cta.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_cta_param_2];
+; CHECK-NEXT:    st.release.cta.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_cta_param_3];
+; CHECK-NEXT:    ld.acquire.cta.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.cta.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.cta.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.cta.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.cta.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.cta.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.cta.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.cta.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.cta.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.cta.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") release, align 8
 
   ret void
@@ -567,40 +815,59 @@ define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
 
 ; CHECK-LABEL: generic_acq_rel_volatile_cta
 define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_volatile_cta_param_0];
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_volatile_cta_param_2];
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_volatile_cta_param_3];
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8
 
   ret void
@@ -608,52 +875,71 @@ define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e
 
 ; CHECK-LABEL: generic_sc_sys
 define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_sys_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_sys_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_sys_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e seq_cst, align 8
 
   ret void
@@ -661,52 +947,71 @@ define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname
 
 ; CHECK-LABEL: generic_sc_volatile_sys
 define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_volatile_sys_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_volatile_sys_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_volatile_sys_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e seq_cst, align 8
 
   ret void
@@ -714,52 +1019,71 @@ define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 
 ; CHECK-LABEL: generic_sc_gpu
 define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_gpu_param_0];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_gpu_param_2];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_gpu_param_3];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -767,52 +1091,71 @@ define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname
 
 ; CHECK-LABEL: generic_sc_volatile_gpu
 define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]  
+; CHECK-LABEL: generic_sc_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_volatile_gpu_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_volatile_gpu_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_volatile_gpu_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -820,52 +1163,71 @@ define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 
 ; CHECK-LABEL: generic_sc_cta
 define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_cta_param_0];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_cta_param_2];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_cta_param_3];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -873,52 +1235,71 @@ define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname
 
 ; CHECK-LABEL: generic_sc_volatile_cta
 define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_volatile_cta_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_volatile_cta_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_volatile_cta_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -928,40 +1309,59 @@ define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 
 ; CHECK-LABEL: global_unordered_gpu
 define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_unordered_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_unordered_gpu_param_0];
+; CHECK-NEXT:    ld.relaxed.gpu.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_unordered_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_unordered_gpu_param_2];
+; CHECK-NEXT:    st.relaxed.gpu.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_unordered_gpu_param_3];
+; CHECK-NEXT:    ld.relaxed.gpu.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_unordered_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.gpu.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.gpu.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.gpu.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.gpu.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.gpu.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.relaxed.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.relaxed.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.relaxed.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
 
   ret void
@@ -969,40 +1369,59 @@ define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_unordered_volatile_gpu
 define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_unordered_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_unordered_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_unordered_volatile_gpu_param_2];
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_unordered_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_unordered_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
 
   ret void
@@ -1010,40 +1429,59 @@ define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_unordered_cta
 define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_unordered_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_unordered_cta_param_0];
+; CHECK-NEXT:    ld.relaxed.cta.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_unordered_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_unordered_cta_param_2];
+; CHECK-NEXT:    st.relaxed.cta.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_unordered_cta_param_3];
+; CHECK-NEXT:    ld.relaxed.cta.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_unordered_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cta.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cta.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cta.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cta.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cta.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cta.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cta.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cta.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cta.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.relaxed.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.relaxed.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.relaxed.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1051,40 +1489,59 @@ define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_unordered_volatile_cta
 define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_unordered_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_cta_param_0];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_unordered_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_unordered_volatile_cta_param_2];
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_unordered_volatile_cta_param_3];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_unordered_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1092,40 +1549,59 @@ define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_monotonic_gpu
 define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_monotonic_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_monotonic_gpu_param_0];
+; CHECK-NEXT:    ld.relaxed.gpu.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_monotonic_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_monotonic_gpu_param_2];
+; CHECK-NEXT:    st.relaxed.gpu.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_monotonic_gpu_param_3];
+; CHECK-NEXT:    ld.relaxed.gpu.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_monotonic_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.gpu.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.gpu.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.gpu.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.gpu.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.gpu.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.gpu.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.relaxed.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.relaxed.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.relaxed.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -1133,40 +1609,59 @@ define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_monotonic_volatile_gpu
 define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_monotonic_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_monotonic_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_monotonic_volatile_gpu_param_2];
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_monotonic_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_monotonic_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -1174,40 +1669,59 @@ define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_monotonic_cta
 define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_monotonic_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_monotonic_cta_param_0];
+; CHECK-NEXT:    ld.relaxed.cta.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_monotonic_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_monotonic_cta_param_2];
+; CHECK-NEXT:    st.relaxed.cta.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_monotonic_cta_param_3];
+; CHECK-NEXT:    ld.relaxed.cta.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_monotonic_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cta.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cta.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cta.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cta.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cta.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cta.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cta.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cta.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cta.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -1215,40 +1729,59 @@ define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_monotonic_volatile_cta
 define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_monotonic_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_cta_param_0];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_monotonic_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_monotonic_volatile_cta_param_2];
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_monotonic_volatile_cta_param_3];
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_monotonic_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -1256,40 +1789,59 @@ define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_acq_rel_sys
 define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_sys_param_0];
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_sys_param_2];
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_sys_param_3];
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e release, align 8
 
   ret void
@@ -1297,40 +1849,59 @@ define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_acq_rel_volatile_sys
 define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_volatile_sys_param_0];
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_volatile_sys_param_2];
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_volatile_sys_param_3];
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
 
   ret void
@@ -1338,40 +1909,59 @@ define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_acq_rel_gpu
 define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_gpu_param_0];
+; CHECK-NEXT:    ld.acquire.gpu.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_gpu_param_2];
+; CHECK-NEXT:    st.release.gpu.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_gpu_param_3];
+; CHECK-NEXT:    ld.acquire.gpu.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.gpu.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.gpu.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.gpu.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.gpu.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.gpu.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.gpu.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.gpu.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.gpu.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.gpu.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
 
   ret void
@@ -1379,40 +1969,59 @@ define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_acq_rel_volatile_gpu
 define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_volatile_gpu_param_2];
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
 
   ret void
@@ -1420,40 +2029,59 @@ define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_acq_rel_cta
 define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_cta_param_0];
+; CHECK-NEXT:    ld.acquire.cta.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_cta_param_2];
+; CHECK-NEXT:    st.release.cta.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_cta_param_3];
+; CHECK-NEXT:    ld.acquire.cta.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.cta.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.cta.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.cta.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.cta.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.cta.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.cta.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.cta.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.cta.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.cta.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
 
   ret void
@@ -1461,40 +2089,59 @@ define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_acq_rel_volatile_cta
 define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_volatile_cta_param_0];
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_volatile_cta_param_2];
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_volatile_cta_param_3];
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
 
   ret void
@@ -1502,52 +2149,71 @@ define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_seq_cst_sys
 define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_sys_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_sys_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_sys_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
 
   ret void
@@ -1555,52 +2221,71 @@ define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_seq_cst_volatile_sys
 define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_volatile_sys_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_volatile_sys_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_volatile_sys_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
 
   ret void
@@ -1608,52 +2293,71 @@ define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_seq_cst_gpu
 define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_gpu_param_0];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_gpu_param_2];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_gpu_param_3];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -1661,52 +2365,71 @@ define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_seq_cst_volatile_gpu
 define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_volatile_gpu_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_volatile_gpu_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_volatile_gpu_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -1714,52 +2437,71 @@ define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_seq_cst_cta
 define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_cta_param_0];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_cta_param_2];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_cta_param_3];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -1767,52 +2509,71 @@ define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_seq_cst_volatile_cta
 define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_volatile_cta_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_volatile_cta_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_volatile_cta_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -1822,40 +2583,59 @@ define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: shared_unordered_gpu
 define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_gpu_param_0];
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_unordered_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_unordered_gpu_param_2];
+; CHECK-NEXT:    st.relaxed.gpu.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_unordered_gpu_param_3];
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_unordered_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.relaxed.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.relaxed.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.relaxed.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
 
   ret void
@@ -1863,40 +2643,59 @@ define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_unordered_volatile_gpu
 define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_unordered_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_unordered_volatile_gpu_param_2];
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_unordered_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_unordered_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
 
   ret void
@@ -1904,40 +2703,59 @@ define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_unordered_cta
 define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_cta_param_0];
+; CHECK-NEXT:    ld.relaxed.cta.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_unordered_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_unordered_cta_param_2];
+; CHECK-NEXT:    st.relaxed.cta.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_unordered_cta_param_3];
+; CHECK-NEXT:    ld.relaxed.cta.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_unordered_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cta.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cta.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cta.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cta.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cta.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.relaxed.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.relaxed.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.relaxed.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1945,40 +2763,59 @@ define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_unordered_volatile_cta
 define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_cta_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_unordered_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_unordered_volatile_cta_param_2];
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_unordered_volatile_cta_param_3];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_unordered_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1986,40 +2823,59 @@ define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_monotonic_gpu
 define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_gpu_param_0];
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_monotonic_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_monotonic_gpu_param_2];
+; CHECK-NEXT:    st.relaxed.gpu.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_monotonic_gpu_param_3];
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_monotonic_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.gpu.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.gpu.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.relaxed.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.relaxed.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.relaxed.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2027,40 +2883,59 @@ define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_monotonic_volatile_gpu
 define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_monotonic_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_monotonic_volatile_gpu_param_2];
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_monotonic_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_monotonic_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2068,40 +2943,59 @@ define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_monotonic_cta
 define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_cta_param_0];
+; CHECK-NEXT:    ld.relaxed.cta.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_monotonic_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_monotonic_cta_param_2];
+; CHECK-NEXT:    st.relaxed.cta.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_monotonic_cta_param_3];
+; CHECK-NEXT:    ld.relaxed.cta.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_monotonic_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cta.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cta.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cta.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cta.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cta.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cta.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -2109,40 +3003,59 @@ define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_monotonic_volatile_cta
 define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_cta_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_monotonic_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_monotonic_volatile_cta_param_2];
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_monotonic_volatile_cta_param_3];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_monotonic_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -2150,40 +3063,59 @@ define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_acq_rel_sys
 define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_sys_param_0];
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_sys_param_2];
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_sys_param_3];
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e release, align 8
 
   ret void
@@ -2191,40 +3123,59 @@ define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_acq_rel_volatile_sys
 define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_volatile_sys_param_0];
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_volatile_sys_param_2];
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_volatile_sys_param_3];
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
 
   ret void
@@ -2232,40 +3183,59 @@ define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_acq_rel_gpu
 define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_gpu_param_0];
+; CHECK-NEXT:    ld.acquire.gpu.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_gpu_param_2];
+; CHECK-NEXT:    st.release.gpu.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_gpu_param_3];
+; CHECK-NEXT:    ld.acquire.gpu.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.gpu.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.gpu.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.gpu.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.gpu.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.gpu.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
 
   ret void
@@ -2273,40 +3243,59 @@ define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_acq_rel_volatile_gpu
 define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_volatile_gpu_param_2];
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
 
   ret void
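
Note how the scope qualifier behaves under volatile in the checks above: although these accesses use syncscope("device"), the regenerated output still shows ld.acquire.sys / st.release.sys, i.e. volatile atomics stay at system scope rather than narrowing to .gpu. A minimal sketch (hypothetical function name; PTX abbreviated, assuming sm_70+):

define i32 @volatile_device_scope(ptr addrspace(3) %p) {
  %v = load atomic volatile i32, ptr addrspace(3) %p syncscope("device") acquire, align 4
  ret i32 %v
}
; expected PTX: scope remains .sys despite the "device" syncscope:
;   ld.acquire.sys.shared.b32 %r1, [%rd1];
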
@@ -2314,40 +3303,59 @@ define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_acq_rel_cta
 define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_cta_param_0];
+; CHECK-NEXT:    ld.acquire.cta.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_cta_param_2];
+; CHECK-NEXT:    st.release.cta.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_cta_param_3];
+; CHECK-NEXT:    ld.acquire.cta.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.cta.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.cta.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.cta.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.cta.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.cta.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.cta.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.cta.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.cta.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.cta.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
 
   ret void
@@ -2355,40 +3363,59 @@ define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_acq_rel_volatile_cta
 define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_volatile_cta_param_0];
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_volatile_cta_param_2];
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_volatile_cta_param_3];
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
 
   ret void
@@ -2396,52 +3423,71 @@ define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_seq_cst_sys
 define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_sys_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_sys_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_sys_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
 
   ret void
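
The seq_cst functions show the two-instruction lowering used throughout: each seq_cst access becomes a fence.sc at the requested scope followed by the corresponding ld.acquire or st.release at the same scope. A minimal sketch (hypothetical function name; PTX abbreviated, assuming sm_70+):

define i32 @seq_cst_load(ptr addrspace(3) %p) {
  %v = load atomic i32, ptr addrspace(3) %p seq_cst, align 4
  ret i32 %v
}
; expected PTX for the load:
;   fence.sc.sys;
;   ld.acquire.sys.shared.b32 %r1, [%rd1];
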
@@ -2449,52 +3495,71 @@ define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_seq_cst_volatile_sys
 define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_volatile_sys_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_volatile_sys_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_volatile_sys_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
 
   ret void
@@ -2502,52 +3567,71 @@ define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_seq_cst_gpu
 define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_gpu_param_0];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_gpu_param_2];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_gpu_param_3];
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    ld.acquire.gpu.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.gpu;
+; CHECK-NEXT:    st.release.gpu.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -2555,52 +3639,71 @@ define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_seq_cst_volatile_gpu
 define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_volatile_gpu_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_volatile_gpu_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_volatile_gpu_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -2608,52 +3711,71 @@ define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_seq_cst_cta
 define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_cta_param_0];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_cta_param_2];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_cta_param_3];
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    ld.acquire.cta.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.cta;
+; CHECK-NEXT:    st.release.cta.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -2661,52 +3783,71 @@ define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_seq_cst_volatile_cta
 define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_volatile_cta_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_volatile_cta_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_volatile_cta_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -2716,40 +3857,59 @@ define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: local_unordered_gpu
 define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_unordered_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_unordered_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_unordered_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_unordered_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
 
   ret void
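
For the local (addrspace(5)) tests that follow, the checks show ordering and scope qualifiers being dropped entirely: .local memory is private to a thread, so atomic ordering degenerates to plain ld.local/st.local. A minimal sketch (hypothetical function name; PTX abbreviated), which also shows the float again landing in the untyped %r registers:

define float @local_f32(ptr addrspace(5) %p) {
  %v = load atomic float, ptr addrspace(5) %p syncscope("device") unordered, align 4
  %r = fadd float %v, 1.0
  store atomic float %r, ptr addrspace(5) %p syncscope("device") unordered, align 4
  ret float %r
}
; expected PTX for the load, add, and store:
;   ld.local.b32 %r1, [%rd1];
;   add.rn.f32 %r2, %r1, 0f3F800000;
;   st.local.b32 [%rd1], %r2;
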
@@ -2757,40 +3917,59 @@ define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_unordered_volatile_gpu
 define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_unordered_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_unordered_volatile_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_unordered_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_unordered_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
 
   ret void
@@ -2798,40 +3977,59 @@ define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_unordered_cta
 define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_unordered_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_unordered_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_unordered_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_unordered_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
 
   ret void
@@ -2839,40 +4037,59 @@ define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_unordered_volatile_cta
 define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_unordered_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_unordered_volatile_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_unordered_volatile_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_unordered_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
 
   ret void
@@ -2880,40 +4097,59 @@ define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_monotonic_gpu
 define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_monotonic_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_monotonic_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_monotonic_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_monotonic_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2921,40 +4157,59 @@ define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_monotonic_volatile_gpu
 define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_monotonic_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_monotonic_volatile_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_monotonic_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_monotonic_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2962,40 +4217,59 @@ define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_monotonic_cta
 define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_monotonic_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_monotonic_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_monotonic_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_monotonic_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -3003,40 +4277,59 @@ define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_monotonic_volatile_cta
 define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_monotonic_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_monotonic_volatile_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_monotonic_volatile_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_monotonic_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -3044,40 +4337,59 @@ define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_acq_rel_sys
 define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_sys_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_sys_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_sys_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e release, align 8
 
   ret void
@@ -3085,40 +4397,59 @@ define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_acq_rel_volatile_sys
 define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_volatile_sys_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_volatile_sys_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_volatile_sys_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
 
   ret void
@@ -3126,40 +4457,59 @@ define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_acq_rel_gpu
 define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
 
   ret void
@@ -3167,40 +4517,59 @@ define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_acq_rel_volatile_gpu
 define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_volatile_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
 
   ret void
@@ -3208,40 +4577,59 @@ define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_acq_rel_cta
 define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
 
   ret void
@@ -3249,40 +4637,59 @@ define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_acq_rel_volatile_cta
 define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_volatile_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_volatile_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_volatile_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
 
   ret void
@@ -3290,40 +4697,59 @@ define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_seq_cst_sys
 define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_sys_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_sys_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_sys_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8
 
   ret void
@@ -3331,40 +4757,59 @@ define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_seq_cst_volatile_sys
 define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_volatile_sys_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_volatile_sys_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_volatile_sys_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
 
   ret void
@@ -3372,40 +4817,59 @@ define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_seq_cst_gpu
 define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -3413,40 +4877,59 @@ define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_seq_cst_volatile_gpu
 define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_volatile_gpu(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_volatile_gpu_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_volatile_gpu_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_volatile_gpu_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_volatile_gpu_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_volatile_gpu_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -3454,40 +4937,59 @@ define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_seq_cst_cta
 define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -3495,40 +4997,59 @@ define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_seq_cst_volatile_cta
 define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_volatile_cta(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_volatile_cta_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_volatile_cta_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_volatile_cta_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_volatile_cta_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_volatile_cta_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
 
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
index ae559f50d4987..ed170e92917f5 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
 ; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
 
@@ -34,40 +35,59 @@
 
 ; CHECK-LABEL: generic_unordered_cluster
 define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_cluster_param_0];
+; CHECK-NEXT:    ld.relaxed.cluster.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_unordered_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_unordered_cluster_param_2];
+; CHECK-NEXT:    st.relaxed.cluster.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_unordered_cluster_param_3];
+; CHECK-NEXT:    ld.relaxed.cluster.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_unordered_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cluster.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cluster.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cluster.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cluster.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cluster.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cluster.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cluster.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cluster.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cluster.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.relaxed.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.relaxed.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.relaxed.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -75,40 +95,59 @@ define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) l
 
 ; CHECK-LABEL: generic_unordered_volatile_cluster
 define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_unordered_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_unordered_volatile_cluster_param_2];
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_unordered_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.volatile.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_unordered_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -116,40 +155,59 @@ define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d,
 
 ; CHECK-LABEL: generic_monotonic_cluster
 define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_cluster_param_0];
+; CHECK-NEXT:    ld.relaxed.cluster.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_monotonic_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_monotonic_cluster_param_2];
+; CHECK-NEXT:    st.relaxed.cluster.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_monotonic_cluster_param_3];
+; CHECK-NEXT:    ld.relaxed.cluster.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_monotonic_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cluster.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cluster.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cluster.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cluster.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cluster.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cluster.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cluster.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cluster.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cluster.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -157,40 +215,59 @@ define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) l
 
 ; CHECK-LABEL: generic_monotonic_volatile_cluster
 define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_monotonic_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_monotonic_volatile_cluster_param_2];
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_monotonic_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.volatile.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_monotonic_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -198,40 +275,59 @@ define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d,
 
 ; CHECK-LABEL: generic_acq_rel_cluster
 define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_cluster_param_0];
+; CHECK-NEXT:    ld.acquire.cluster.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_cluster_param_2];
+; CHECK-NEXT:    st.release.cluster.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_cluster_param_3];
+; CHECK-NEXT:    ld.acquire.cluster.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.cluster.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.cluster.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.cluster.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.cluster.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.cluster.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.cluster.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.cluster.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.cluster.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.cluster.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") release, align 8
 
   ret void
@@ -239,40 +335,59 @@ define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 
 ; CHECK-LABEL: generic_acq_rel_volatile_cluster
 define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_acq_rel_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_acq_rel_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_acq_rel_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_acq_rel_volatile_cluster_param_2];
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_acq_rel_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_acq_rel_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8
 
   ret void
@@ -280,52 +395,71 @@ define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, pt
 
 ; CHECK-LABEL: generic_sc_cluster
 define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_cluster_param_0];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_cluster_param_2];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_cluster_param_3];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -333,52 +467,71 @@ define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_un
 
 ; CHECK-LABEL: generic_sc_volatile_cluster
 define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_sc_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_sc_volatile_cluster_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [generic_sc_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [generic_sc_volatile_cluster_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [generic_sc_volatile_cluster_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [generic_sc_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -388,40 +541,59 @@ define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e)
 
 ; CHECK-LABEL: global_unordered_cluster
 define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_unordered_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_unordered_cluster_param_0];
+; CHECK-NEXT:    ld.relaxed.cluster.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_unordered_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_unordered_cluster_param_2];
+; CHECK-NEXT:    st.relaxed.cluster.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_unordered_cluster_param_3];
+; CHECK-NEXT:    ld.relaxed.cluster.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_unordered_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cluster.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cluster.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cluster.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cluster.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cluster.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.relaxed.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.relaxed.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.relaxed.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -429,40 +601,59 @@ define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b,
 
 ; CHECK-LABEL: global_unordered_volatile_cluster
 define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_unordered_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_unordered_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_unordered_volatile_cluster_param_2];
+; CHECK-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_unordered_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.volatile.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_unordered_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.volatile.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.volatile.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.volatile.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -470,40 +661,59 @@ define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspac
 
 ; CHECK-LABEL: global_monotonic_cluster
 define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_monotonic_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_monotonic_cluster_param_0];
+; CHECK-NEXT:    ld.relaxed.cluster.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_monotonic_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_monotonic_cluster_param_2];
+; CHECK-NEXT:    st.relaxed.cluster.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_monotonic_cluster_param_3];
+; CHECK-NEXT:    ld.relaxed.cluster.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_monotonic_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cluster.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cluster.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cluster.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cluster.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cluster.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cluster.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -511,40 +721,59 @@ define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b,
 
 ; CHECK-LABEL: global_monotonic_volatile_cluster
 define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_monotonic_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_monotonic_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_monotonic_volatile_cluster_param_2];
+; CHECK-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_monotonic_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.volatile.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_monotonic_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.volatile.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.volatile.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.volatile.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -552,40 +781,59 @@ define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspac
 
 ; CHECK-LABEL: global_acq_rel_cluster
 define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_cluster_param_0];
+; CHECK-NEXT:    ld.acquire.cluster.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_cluster_param_2];
+; CHECK-NEXT:    st.release.cluster.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_cluster_param_3];
+; CHECK-NEXT:    ld.acquire.cluster.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.cluster.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.cluster.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.cluster.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.cluster.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.cluster.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.cluster.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.cluster.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.cluster.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.cluster.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
 
   ret void
@@ -593,40 +841,59 @@ define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, pt
 
 ; CHECK-LABEL: global_acq_rel_volatile_cluster
 define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_acq_rel_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_acq_rel_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_acq_rel_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_acq_rel_volatile_cluster_param_2];
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_acq_rel_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_acq_rel_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
 
   ret void
@@ -634,52 +901,71 @@ define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(
 
 ; CHECK-LABEL: global_seq_cst_cluster
 define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_cluster_param_0];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_cluster_param_2];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_cluster_param_3];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -687,52 +973,71 @@ define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, pt
 
 ; CHECK-LABEL: global_seq_cst_volatile_cluster
 define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_seq_cst_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_seq_cst_volatile_cluster_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [global_seq_cst_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [global_seq_cst_volatile_cluster_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [global_seq_cst_volatile_cluster_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [global_seq_cst_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.global.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.global.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -742,40 +1047,59 @@ define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(
 
 ; CHECK-LABEL: shared_unordered_cluster
 define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_cluster_param_0];
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_unordered_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_unordered_cluster_param_2];
+; CHECK-NEXT:    st.relaxed.cluster.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_unordered_cluster_param_3];
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_unordered_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.relaxed.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.relaxed.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.relaxed.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -783,40 +1107,59 @@ define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b,
 
 ; CHECK-LABEL: shared_unordered_volatile_cluster
 define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_unordered_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_unordered_volatile_cluster_param_2];
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_unordered_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_unordered_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -824,40 +1167,59 @@ define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspac
 
 ; CHECK-LABEL: shared_monotonic_cluster
 define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_cluster_param_0];
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_monotonic_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_monotonic_cluster_param_2];
+; CHECK-NEXT:    st.relaxed.cluster.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_monotonic_cluster_param_3];
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_monotonic_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.relaxed.cluster.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.relaxed.cluster.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -865,40 +1227,59 @@ define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b,
 
 ; CHECK-LABEL: shared_monotonic_volatile_cluster
 define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_monotonic_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_monotonic_volatile_cluster_param_2];
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_monotonic_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_monotonic_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -906,40 +1287,59 @@ define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspac
 
 ; CHECK-LABEL: shared_acq_rel_cluster
 define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_cluster_param_0];
+; CHECK-NEXT:    ld.acquire.cluster.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_cluster_param_2];
+; CHECK-NEXT:    st.release.cluster.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_cluster_param_3];
+; CHECK-NEXT:    ld.acquire.cluster.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.cluster.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.cluster.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.cluster.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.cluster.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.cluster.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
 
   ret void
@@ -947,40 +1347,59 @@ define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, pt
 
 ; CHECK-LABEL: shared_acq_rel_volatile_cluster
 define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_acq_rel_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_acq_rel_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_acq_rel_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_acq_rel_volatile_cluster_param_2];
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_acq_rel_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_acq_rel_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
 
   ret void
@@ -988,52 +1407,71 @@ define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(
 
 ; CHECK-LABEL: shared_seq_cst_cluster
 define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_cluster_param_0];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_cluster_param_2];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_cluster_param_3];
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    ld.acquire.cluster.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.cluster;
+; CHECK-NEXT:    st.release.cluster.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -1041,52 +1479,71 @@ define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, pt
 
 ; CHECK-LABEL: shared_seq_cst_volatile_cluster
 define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_seq_cst_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_seq_cst_volatile_cluster_param_0];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [shared_seq_cst_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [shared_seq_cst_volatile_cluster_param_2];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [shared_seq_cst_volatile_cluster_param_3];
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [shared_seq_cst_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b16 [%rd2], %rs4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd3], %r2;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd4], %rd7;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b32 [%rd5], %r4;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    ld.acquire.sys.shared.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    fence.sc.sys;
+; CHECK-NEXT:    st.release.sys.shared.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -1096,40 +1553,59 @@ define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(
 
 ; CHECK-LABEL: local_unordered_cluster
 define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_unordered_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_unordered_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_unordered_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_unordered_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -1137,40 +1613,59 @@ define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, p
 
 ; CHECK-LABEL: local_unordered_volatile_cluster
 define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_unordered_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_unordered_volatile_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_unordered_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_unordered_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -1178,40 +1673,59 @@ define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace
 
 ; CHECK-LABEL: local_monotonic_cluster
 define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_monotonic_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_monotonic_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_monotonic_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_monotonic_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -1219,40 +1733,59 @@ define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, p
 
 ; CHECK-LABEL: local_monotonic_volatile_cluster
 define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_monotonic_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_monotonic_volatile_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_monotonic_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_monotonic_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -1260,40 +1793,59 @@ define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace
 
 ; CHECK-LABEL: local_acq_rel_cluster
 define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
 
   ret void
@@ -1301,40 +1853,59 @@ define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr
 
 ; CHECK-LABEL: local_acq_rel_volatile_cluster
 define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_acq_rel_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_acq_rel_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_acq_rel_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_acq_rel_volatile_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_acq_rel_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_acq_rel_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
 
   ret void
@@ -1342,40 +1913,59 @@ define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5
 
 ; CHECK-LABEL: local_seq_cst_cluster
 define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -1383,40 +1973,59 @@ define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr
 
 ; CHECK-LABEL: local_seq_cst_volatile_cluster
 define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_seq_cst_volatile_cluster(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_seq_cst_volatile_cluster_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [local_seq_cst_volatile_cluster_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [local_seq_cst_volatile_cluster_param_2];
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [local_seq_cst_volatile_cluster_param_3];
+; CHECK-NEXT:    ld.local.b16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd5, [local_seq_cst_volatile_cluster_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.b16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.b32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.b64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.b64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.b32 %r3, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %r4, %r3, 0f3F800000;
+; CHECK-NEXT:    st.local.b32 [%rd5], %r4;
+; CHECK-NEXT:    ld.local.b64 %rd8, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.b64 [%rd5], %rd9;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
 
   ret void
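
The remaining orderings in this file follow the same pattern: with the float
register classes removed, the autogenerated checks expect f32 values in the
untyped .b32 %r registers and f64 values in the .b64 %rd registers, while the
floating-point opcodes themselves (add.rn.f32 / add.rn.f64) are unchanged. A
minimal sketch of the new mapping (illustrative function, not part of the
patch):

  define float @f32_in_untyped_reg(float %x) {
    ; previously checked as: add.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, 0f3F800000;
    ; now checked as:        add.rn.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, 0f3F800000;
    %y = fadd float %x, 1.0
    ret float %y
  }
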
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
index 306e71eadca70..dfbc2c34b15d4 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
@@ -139,16 +139,16 @@ define void @generic_4xi64(ptr %a, ptr %b) {
 define void @generic_8xfloat(ptr %a, ptr %b) {
 ; CHECK-LABEL: generic_8xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xfloat_param_0];
-; CHECK-NEXT:    ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    ld.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [generic_8xfloat_param_1];
-; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT:    st.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
   %a.load = load <8 x float>, ptr %a
   store <8 x float> %a.load, ptr %b
@@ -158,16 +158,15 @@ define void @generic_8xfloat(ptr %a, ptr %b) {
 define void @generic_4xdouble(ptr %a, ptr %b) {
 ; CHECK-LABEL: generic_4xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xdouble_param_0];
-; CHECK-NEXT:    ld.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    ld.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; CHECK-NEXT:    ld.param.b64 %rd2, [generic_4xdouble_param_1];
-; CHECK-NEXT:    st.v2.b64 [%rd2+16], {%fd3, %fd4};
-; CHECK-NEXT:    st.v2.b64 [%rd2], {%fd1, %fd2};
+; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT:    ld.param.b64 %rd6, [generic_4xdouble_param_1];
+; CHECK-NEXT:    st.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT:    st.v2.b64 [%rd6], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x double>, ptr %a
   store <4 x double> %a.load, ptr %b
@@ -292,16 +291,16 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) {
 define void @generic_volatile_8xfloat(ptr %a, ptr %b) {
 ; CHECK-LABEL: generic_volatile_8xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    ld.volatile.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1];
-; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <8 x float>, ptr %a
   store volatile <8 x float> %a.load, ptr %b
@@ -311,16 +310,15 @@ define void @generic_volatile_8xfloat(ptr %a, ptr %b) {
 define void @generic_volatile_4xdouble(ptr %a, ptr %b) {
 ; CHECK-LABEL: generic_volatile_4xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    ld.volatile.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_4xdouble_param_1];
-; CHECK-NEXT:    st.volatile.v2.b64 [%rd2+16], {%fd3, %fd4};
-; CHECK-NEXT:    st.volatile.v2.b64 [%rd2], {%fd1, %fd2};
+; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT:    ld.param.b64 %rd6, [generic_volatile_4xdouble_param_1];
+; CHECK-NEXT:    st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT:    st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x double>, ptr %a
   store volatile <4 x double> %a.load, ptr %b
@@ -518,28 +516,28 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SM90-LABEL: global_8xfloat(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %f<9>;
+; SM90-NEXT:    .reg .b32 %r<9>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [global_8xfloat_param_0];
-; SM90-NEXT:    ld.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; SM90-NEXT:    ld.global.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; SM90-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT:    ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; SM90-NEXT:    ld.param.b64 %rd2, [global_8xfloat_param_1];
-; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; SM90-NEXT:    st.global.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT:    st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; SM90-NEXT:    ret;
 ;
 ; SM100-LABEL: global_8xfloat(
 ; SM100:       {
-; SM100-NEXT:    .reg .b32 %f<9>;
+; SM100-NEXT:    .reg .b32 %r<9>;
 ; SM100-NEXT:    .reg .b64 %rd<3>;
 ; SM100-EMPTY:
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [global_8xfloat_param_0];
-; SM100-NEXT:    ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
+; SM100-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; SM100-NEXT:    ld.param.b64 %rd2, [global_8xfloat_param_1];
-; SM100-NEXT:    st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; SM100-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; SM100-NEXT:    ret;
   %a.load = load <8 x float>, ptr addrspace(1) %a
   store <8 x float> %a.load, ptr addrspace(1) %b
@@ -549,28 +547,26 @@ define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SM90-LABEL: global_4xdouble(
 ; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-NEXT:    .reg .b64 %fd<5>;
+; SM90-NEXT:    .reg .b64 %rd<7>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [global_4xdouble_param_0];
-; SM90-NEXT:    ld.global.v2.b64 {%fd1, %fd2}, [%rd1];
-; SM90-NEXT:    ld.global.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; SM90-NEXT:    ld.param.b64 %rd2, [global_4xdouble_param_1];
-; SM90-NEXT:    st.global.v2.b64 [%rd2+16], {%fd3, %fd4};
-; SM90-NEXT:    st.global.v2.b64 [%rd2], {%fd1, %fd2};
+; SM90-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM90-NEXT:    ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM90-NEXT:    ld.param.b64 %rd6, [global_4xdouble_param_1];
+; SM90-NEXT:    st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM90-NEXT:    st.global.v2.b64 [%rd6], {%rd2, %rd3};
 ; SM90-NEXT:    ret;
 ;
 ; SM100-LABEL: global_4xdouble(
 ; SM100:       {
-; SM100-NEXT:    .reg .b64 %rd<3>;
-; SM100-NEXT:    .reg .b64 %fd<5>;
+; SM100-NEXT:    .reg .b64 %rd<7>;
 ; SM100-EMPTY:
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [global_4xdouble_param_0];
-; SM100-NEXT:    ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
-; SM100-NEXT:    ld.param.b64 %rd2, [global_4xdouble_param_1];
-; SM100-NEXT:    st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
+; SM100-NEXT:    ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT:    ld.param.b64 %rd6, [global_4xdouble_param_1];
+; SM100-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
 ; SM100-NEXT:    ret;
   %a.load = load <4 x double>, ptr addrspace(1) %a
   store <4 x double> %a.load, ptr addrspace(1) %b
@@ -766,28 +762,28 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SM90-LABEL: global_volatile_8xfloat(
 ; SM90:       {
-; SM90-NEXT:    .reg .b32 %f<9>;
+; SM90-NEXT:    .reg .b32 %r<9>;
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
-; SM90-NEXT:    ld.volatile.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; SM90-NEXT:    ld.volatile.global.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; SM90-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT:    ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_8xfloat_param_1];
-; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; SM90-NEXT:    ret;
 ;
 ; SM100-LABEL: global_volatile_8xfloat(
 ; SM100:       {
-; SM100-NEXT:    .reg .b32 %f<9>;
+; SM100-NEXT:    .reg .b32 %r<9>;
 ; SM100-NEXT:    .reg .b64 %rd<3>;
 ; SM100-EMPTY:
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
-; SM100-NEXT:    ld.volatile.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
+; SM100-NEXT:    ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_8xfloat_param_1];
-; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
+; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; SM100-NEXT:    ret;
   %a.load = load volatile <8 x float>, ptr addrspace(1) %a
   store volatile <8 x float> %a.load, ptr addrspace(1) %b
@@ -797,28 +793,26 @@ define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 define void @global_volatile_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SM90-LABEL: global_volatile_4xdouble(
 ; SM90:       {
-; SM90-NEXT:    .reg .b64 %rd<3>;
-; SM90-NEXT:    .reg .b64 %fd<5>;
+; SM90-NEXT:    .reg .b64 %rd<7>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_4xdouble_param_0];
-; SM90-NEXT:    ld.volatile.global.v2.b64 {%fd1, %fd2}, [%rd1];
-; SM90-NEXT:    ld.volatile.global.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_4xdouble_param_1];
-; SM90-NEXT:    st.volatile.global.v2.b64 [%rd2+16], {%fd3, %fd4};
-; SM90-NEXT:    st.volatile.global.v2.b64 [%rd2], {%fd1, %fd2};
+; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM90-NEXT:    ld.param.b64 %rd6, [global_volatile_4xdouble_param_1];
+; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
 ; SM90-NEXT:    ret;
 ;
 ; SM100-LABEL: global_volatile_4xdouble(
 ; SM100:       {
-; SM100-NEXT:    .reg .b64 %rd<3>;
-; SM100-NEXT:    .reg .b64 %fd<5>;
+; SM100-NEXT:    .reg .b64 %rd<7>;
 ; SM100-EMPTY:
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_4xdouble_param_0];
-; SM100-NEXT:    ld.volatile.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
-; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_4xdouble_param_1];
-; SM100-NEXT:    st.volatile.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
+; SM100-NEXT:    ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT:    ld.param.b64 %rd6, [global_volatile_4xdouble_param_1];
+; SM100-NEXT:    st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
 ; SM100-NEXT:    ret;
   %a.load = load volatile <4 x double>, ptr addrspace(1) %a
   store volatile <4 x double> %a.load, ptr addrspace(1) %b
@@ -945,16 +939,16 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 ; CHECK-LABEL: shared_8xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xfloat_param_0];
-; CHECK-NEXT:    ld.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    ld.shared.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [shared_8xfloat_param_1];
-; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
   %a.load = load <8 x float>, ptr addrspace(3) %a
   store <8 x float> %a.load, ptr addrspace(3) %b
@@ -964,16 +958,15 @@ define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 define void @shared_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 ; CHECK-LABEL: shared_4xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xdouble_param_0];
-; CHECK-NEXT:    ld.shared.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    ld.shared.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; CHECK-NEXT:    ld.param.b64 %rd2, [shared_4xdouble_param_1];
-; CHECK-NEXT:    st.shared.v2.b64 [%rd2+16], {%fd3, %fd4};
-; CHECK-NEXT:    st.shared.v2.b64 [%rd2], {%fd1, %fd2};
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT:    ld.param.b64 %rd6, [shared_4xdouble_param_1];
+; CHECK-NEXT:    st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd6], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x double>, ptr addrspace(3) %a
   store <4 x double> %a.load, ptr addrspace(3) %b
@@ -1098,16 +1091,16 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 ; CHECK-LABEL: shared_volatile_8xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1];
-; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <8 x float>, ptr addrspace(3) %a
   store volatile <8 x float> %a.load, ptr addrspace(3) %b
@@ -1117,16 +1110,15 @@ define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 define void @shared_volatile_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) {
 ; CHECK-LABEL: shared_volatile_4xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_4xdouble_param_1];
-; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd2+16], {%fd3, %fd4};
-; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd2], {%fd1, %fd2};
+; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT:    ld.param.b64 %rd6, [shared_volatile_4xdouble_param_1];
+; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x double>, ptr addrspace(3) %a
   store volatile <4 x double> %a.load, ptr addrspace(3) %b
@@ -1253,16 +1245,16 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 ; CHECK-LABEL: local_8xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xfloat_param_0];
-; CHECK-NEXT:    ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    ld.local.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [local_8xfloat_param_1];
-; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
   %a.load = load <8 x float>, ptr addrspace(5) %a
   store <8 x float> %a.load, ptr addrspace(5) %b
@@ -1272,16 +1264,15 @@ define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 define void @local_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 ; CHECK-LABEL: local_4xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xdouble_param_0];
-; CHECK-NEXT:    ld.local.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    ld.local.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; CHECK-NEXT:    ld.param.b64 %rd2, [local_4xdouble_param_1];
-; CHECK-NEXT:    st.local.v2.b64 [%rd2+16], {%fd3, %fd4};
-; CHECK-NEXT:    st.local.v2.b64 [%rd2], {%fd1, %fd2};
+; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT:    ld.param.b64 %rd6, [local_4xdouble_param_1];
+; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x double>, ptr addrspace(5) %a
   store <4 x double> %a.load, ptr addrspace(5) %b
@@ -1406,16 +1397,16 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 ; CHECK-LABEL: local_volatile_8xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
-; CHECK-NEXT:    ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    ld.local.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_8xfloat_param_1];
-; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <8 x float>, ptr addrspace(5) %a
   store volatile <8 x float> %a.load, ptr addrspace(5) %b
@@ -1425,16 +1416,15 @@ define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 ; CHECK-LABEL: local_volatile_4xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xdouble_param_0];
-; CHECK-NEXT:    ld.local.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    ld.local.v2.b64 {%fd3, %fd4}, [%rd1+16];
-; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_4xdouble_param_1];
-; CHECK-NEXT:    st.local.v2.b64 [%rd2+16], {%fd3, %fd4};
-; CHECK-NEXT:    st.local.v2.b64 [%rd2], {%fd1, %fd2};
+; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT:    ld.param.b64 %rd6, [local_volatile_4xdouble_param_1];
+; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x double>, ptr addrspace(5) %a
   store volatile <4 x double> %a.load, ptr addrspace(5) %b
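
These CHECK blocks appear to be produced by the UpdateTestChecks flow, so after
a register-class change like this one they would typically be regenerated
rather than edited by hand; a usual invocation (assuming a local build of llc
in build/bin) would be:

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
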
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
index 2b5553a77fe98..9e7e940ab5a75 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -371,15 +371,15 @@ define void @generic_2xi64(ptr %a) {
 define void @generic_2xfloat(ptr %a) {
 ; CHECK-LABEL: generic_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xfloat_param_0];
-; CHECK-NEXT:    ld.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -390,17 +390,17 @@ define void @generic_2xfloat(ptr %a) {
 define void @generic_4xfloat(ptr %a) {
 ; CHECK-LABEL: generic_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xfloat_param_0];
-; CHECK-NEXT:    ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -411,15 +411,14 @@ define void @generic_4xfloat(ptr %a) {
 define void @generic_2xdouble(ptr %a) {
 ; CHECK-LABEL: generic_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xdouble_param_0];
-; CHECK-NEXT:    ld.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -792,15 +791,15 @@ define void @generic_volatile_2xi64(ptr %a) {
 define void @generic_volatile_2xfloat(ptr %a) {
 ; CHECK-LABEL: generic_volatile_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -811,17 +810,17 @@ define void @generic_volatile_2xfloat(ptr %a) {
 define void @generic_volatile_4xfloat(ptr %a) {
 ; CHECK-LABEL: generic_volatile_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -832,15 +831,14 @@ define void @generic_volatile_4xfloat(ptr %a) {
 define void @generic_volatile_2xdouble(ptr %a) {
 ; CHECK-LABEL: generic_volatile_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -1196,15 +1194,15 @@ define void @global_2xi64(ptr addrspace(1) %a) {
 define void @global_2xfloat(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xfloat_param_0];
-; CHECK-NEXT:    ld.global.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr addrspace(1) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -1215,17 +1213,17 @@ define void @global_2xfloat(ptr addrspace(1) %a) {
 define void @global_4xfloat(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xfloat_param_0];
-; CHECK-NEXT:    ld.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr addrspace(1) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -1236,15 +1234,14 @@ define void @global_4xfloat(ptr addrspace(1) %a) {
 define void @global_2xdouble(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xdouble_param_0];
-; CHECK-NEXT:    ld.global.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.global.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.global.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr addrspace(1) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -1598,15 +1595,15 @@ define void @global_volatile_2xi64(ptr addrspace(1) %a) {
 define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_volatile_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr addrspace(1) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -1617,17 +1614,17 @@ define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
 define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_volatile_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr addrspace(1) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -1638,15 +1635,14 @@ define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
 define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
 ; CHECK-LABEL: global_volatile_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.global.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr addrspace(1) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -2002,15 +1998,15 @@ define void @shared_2xi64(ptr addrspace(3) %a) {
 define void @shared_2xfloat(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xfloat_param_0];
-; CHECK-NEXT:    ld.shared.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr addrspace(3) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -2021,17 +2017,17 @@ define void @shared_2xfloat(ptr addrspace(3) %a) {
 define void @shared_4xfloat(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xfloat_param_0];
-; CHECK-NEXT:    ld.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr addrspace(3) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -2042,15 +2038,14 @@ define void @shared_4xfloat(ptr addrspace(3) %a) {
 define void @shared_2xdouble(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xdouble_param_0];
-; CHECK-NEXT:    ld.shared.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.shared.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr addrspace(3) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -2404,15 +2399,15 @@ define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
 define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_volatile_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr addrspace(3) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -2423,17 +2418,17 @@ define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
 define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_volatile_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr addrspace(3) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -2444,15 +2439,14 @@ define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
 define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
 ; CHECK-LABEL: shared_volatile_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr addrspace(3) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -2808,15 +2802,15 @@ define void @local_2xi64(ptr addrspace(5) %a) {
 define void @local_2xfloat(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xfloat_param_0];
-; CHECK-NEXT:    ld.local.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr addrspace(5) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -2827,17 +2821,17 @@ define void @local_2xfloat(ptr addrspace(5) %a) {
 define void @local_4xfloat(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xfloat_param_0];
-; CHECK-NEXT:    ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr addrspace(5) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -2848,15 +2842,14 @@ define void @local_4xfloat(ptr addrspace(5) %a) {
 define void @local_2xdouble(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xdouble_param_0];
-; CHECK-NEXT:    ld.local.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr addrspace(5) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -3210,15 +3203,15 @@ define void @local_volatile_2xi64(ptr addrspace(5) %a) {
 define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_volatile_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.local.v2.b32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr addrspace(5) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -3229,17 +3222,17 @@ define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
 define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_volatile_4xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr addrspace(5) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -3250,15 +3243,14 @@ define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
 define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
 ; CHECK-LABEL: local_volatile_2xdouble(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.local.v2.b64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr addrspace(5) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index c7c1ea84f9a39..e72316ad47136 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -50,12 +50,12 @@ declare double @llvm.fma.f64(double, double, double) #0
 define float @ceil_float(float %a) {
 ; CHECK-LABEL: ceil_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [ceil_float_param_0];
-; CHECK-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [ceil_float_param_0];
+; CHECK-NEXT:    cvt.rpi.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.ceil.f32(float %a)
   ret float %b
@@ -64,12 +64,12 @@ define float @ceil_float(float %a) {
 define float @ceil_float_ftz(float %a) #1 {
 ; CHECK-LABEL: ceil_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [ceil_float_ftz_param_0];
-; CHECK-NEXT:    cvt.rpi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [ceil_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rpi.ftz.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.ceil.f32(float %a)
   ret float %b
@@ -78,12 +78,12 @@ define float @ceil_float_ftz(float %a) #1 {
 define double @ceil_double(double %a) {
 ; CHECK-LABEL: ceil_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [ceil_double_param_0];
-; CHECK-NEXT:    cvt.rpi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [ceil_double_param_0];
+; CHECK-NEXT:    cvt.rpi.f64.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.ceil.f64(double %a)
   ret double %b
@@ -94,12 +94,12 @@ define double @ceil_double(double %a) {
 define float @floor_float(float %a) {
 ; CHECK-LABEL: floor_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [floor_float_param_0];
-; CHECK-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [floor_float_param_0];
+; CHECK-NEXT:    cvt.rmi.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.floor.f32(float %a)
   ret float %b
@@ -108,12 +108,12 @@ define float @floor_float(float %a) {
 define float @floor_float_ftz(float %a) #1 {
 ; CHECK-LABEL: floor_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [floor_float_ftz_param_0];
-; CHECK-NEXT:    cvt.rmi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [floor_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rmi.ftz.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.floor.f32(float %a)
   ret float %b
@@ -122,12 +122,12 @@ define float @floor_float_ftz(float %a) #1 {
 define double @floor_double(double %a) {
 ; CHECK-LABEL: floor_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [floor_double_param_0];
-; CHECK-NEXT:    cvt.rmi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [floor_double_param_0];
+; CHECK-NEXT:    cvt.rmi.f64.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.floor.f64(double %a)
   ret double %b
@@ -140,24 +140,21 @@ define float @round_float(float %a) {
 ; CHECK-LABEL: round_float(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [round_float_param_0];
-; CHECK-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NEXT:    and.b32 %r2, %r1, -2147483648;
-; CHECK-NEXT:    or.b32 %r3, %r2, 1056964608;
-; CHECK-NEXT:    mov.b32 %f2, %r3;
-; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
-; CHECK-NEXT:    abs.f32 %f5, %f1;
-; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
-; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
-; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
-; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f8;
+; CHECK-NEXT:    ld.param.b32 %r12, [round_float_param_0];
+; CHECK-NEXT:    and.b32 %r3, %r12, -2147483648;
+; CHECK-NEXT:    or.b32 %r13, %r3, 1056964608;
+; CHECK-NEXT:    add.rn.f32 %r6, %r12, %r13;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r7, %r6;
+; CHECK-NEXT:    abs.f32 %r8, %r12;
+; CHECK-NEXT:    setp.gt.f32 %p1, %r8, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %r9, %r12, %r7, %p1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r10, %r12;
+; CHECK-NEXT:    setp.lt.f32 %p2, %r8, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %r11, %r10, %r9, %p2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.round.f32(float %a)
   ret float %b
@@ -168,24 +165,21 @@ define float @round_float_ftz(float %a) #1 {
 ; CHECK-LABEL: round_float_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b32 %f<9>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [round_float_ftz_param_0];
-; CHECK-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NEXT:    and.b32 %r2, %r1, -2147483648;
-; CHECK-NEXT:    or.b32 %r3, %r2, 1056964608;
-; CHECK-NEXT:    mov.b32 %f2, %r3;
-; CHECK-NEXT:    add.rn.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f4, %f3;
-; CHECK-NEXT:    abs.ftz.f32 %f5, %f1;
-; CHECK-NEXT:    setp.gt.ftz.f32 %p1, %f5, 0f4B000000;
-; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
-; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f7, %f1;
-; CHECK-NEXT:    setp.lt.ftz.f32 %p2, %f5, 0f3F000000;
-; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f8;
+; CHECK-NEXT:    ld.param.b32 %r12, [round_float_ftz_param_0];
+; CHECK-NEXT:    and.b32 %r3, %r12, -2147483648;
+; CHECK-NEXT:    or.b32 %r13, %r3, 1056964608;
+; CHECK-NEXT:    add.rn.ftz.f32 %r6, %r12, %r13;
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r7, %r6;
+; CHECK-NEXT:    abs.ftz.f32 %r8, %r12;
+; CHECK-NEXT:    setp.gt.ftz.f32 %p1, %r8, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %r9, %r12, %r7, %p1;
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r10, %r12;
+; CHECK-NEXT:    setp.lt.ftz.f32 %p2, %r8, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %r11, %r10, %r9, %p2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.round.f32(float %a)
   ret float %b
@@ -196,19 +190,19 @@ define double @round_double(double %a) {
 ; CHECK-LABEL: round_double(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
-; CHECK-NEXT:    .reg .b64 %fd<8>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [round_double_param_0];
-; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    setp.lt.f64 %p1, %fd2, 0d3FE0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FE0000000000000;
-; CHECK-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
-; CHECK-NEXT:    selp.f64 %fd5, 0d0000000000000000, %fd4, %p1;
-; CHECK-NEXT:    copysign.f64 %fd6, %fd1, %fd5;
-; CHECK-NEXT:    setp.gt.f64 %p2, %fd2, 0d4330000000000000;
-; CHECK-NEXT:    selp.f64 %fd7, %fd1, %fd6, %p2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd7;
+; CHECK-NEXT:    ld.param.b64 %rd1, [round_double_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d3FE0000000000000;
+; CHECK-NEXT:    add.rn.f64 %rd3, %rd2, 0d3FE0000000000000;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    copysign.f64 %rd6, %rd1, %rd5;
+; CHECK-NEXT:    setp.gt.f64 %p2, %rd2, 0d4330000000000000;
+; CHECK-NEXT:    selp.f64 %rd7, %rd1, %rd6, %p2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.round.f64(double %a)
   ret double %b
@@ -219,12 +213,12 @@ define double @round_double(double %a) {
 define float @nearbyint_float(float %a) {
 ; CHECK-LABEL: nearbyint_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [nearbyint_float_param_0];
-; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [nearbyint_float_param_0];
+; CHECK-NEXT:    cvt.rni.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.nearbyint.f32(float %a)
   ret float %b
@@ -233,12 +227,12 @@ define float @nearbyint_float(float %a) {
 define float @nearbyint_float_ftz(float %a) #1 {
 ; CHECK-LABEL: nearbyint_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [nearbyint_float_ftz_param_0];
-; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [nearbyint_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.nearbyint.f32(float %a)
   ret float %b
@@ -247,12 +241,12 @@ define float @nearbyint_float_ftz(float %a) #1 {
 define double @nearbyint_double(double %a) {
 ; CHECK-LABEL: nearbyint_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [nearbyint_double_param_0];
-; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [nearbyint_double_param_0];
+; CHECK-NEXT:    cvt.rni.f64.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.nearbyint.f64(double %a)
   ret double %b
@@ -263,12 +257,12 @@ define double @nearbyint_double(double %a) {
 define float @rint_float(float %a) {
 ; CHECK-LABEL: rint_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [rint_float_param_0];
-; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [rint_float_param_0];
+; CHECK-NEXT:    cvt.rni.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.rint.f32(float %a)
   ret float %b
@@ -277,12 +271,12 @@ define float @rint_float(float %a) {
 define float @rint_float_ftz(float %a) #1 {
 ; CHECK-LABEL: rint_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [rint_float_ftz_param_0];
-; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [rint_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.rint.f32(float %a)
   ret float %b
@@ -291,12 +285,12 @@ define float @rint_float_ftz(float %a) #1 {
 define double @rint_double(double %a) {
 ; CHECK-LABEL: rint_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [rint_double_param_0];
-; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [rint_double_param_0];
+; CHECK-NEXT:    cvt.rni.f64.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.rint.f64(double %a)
   ret double %b
@@ -307,12 +301,12 @@ define double @rint_double(double %a) {
 define float @roundeven_float(float %a) {
 ; CHECK-LABEL: roundeven_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [roundeven_float_param_0];
-; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [roundeven_float_param_0];
+; CHECK-NEXT:    cvt.rni.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.roundeven.f32(float %a)
   ret float %b
@@ -321,12 +315,12 @@ define float @roundeven_float(float %a) {
 define float @roundeven_float_ftz(float %a) #1 {
 ; CHECK-LABEL: roundeven_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [roundeven_float_ftz_param_0];
-; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [roundeven_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.roundeven.f32(float %a)
   ret float %b
@@ -335,12 +329,12 @@ define float @roundeven_float_ftz(float %a) #1 {
 define double @roundeven_double(double %a) {
 ; CHECK-LABEL: roundeven_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [roundeven_double_param_0];
-; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [roundeven_double_param_0];
+; CHECK-NEXT:    cvt.rni.f64.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.roundeven.f64(double %a)
   ret double %b
@@ -351,12 +345,12 @@ define double @roundeven_double(double %a) {
 define float @trunc_float(float %a) {
 ; CHECK-LABEL: trunc_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [trunc_float_param_0];
-; CHECK-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [trunc_float_param_0];
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.trunc.f32(float %a)
   ret float %b
@@ -365,12 +359,12 @@ define float @trunc_float(float %a) {
 define float @trunc_float_ftz(float %a) #1 {
 ; CHECK-LABEL: trunc_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [trunc_float_ftz_param_0];
-; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [trunc_float_ftz_param_0];
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.trunc.f32(float %a)
   ret float %b
@@ -379,12 +373,12 @@ define float @trunc_float_ftz(float %a) #1 {
 define double @trunc_double(double %a) {
 ; CHECK-LABEL: trunc_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [trunc_double_param_0];
-; CHECK-NEXT:    cvt.rzi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [trunc_double_param_0];
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.trunc.f64(double %a)
   ret double %b
@@ -395,12 +389,12 @@ define double @trunc_double(double %a) {
 define float @abs_float(float %a) {
 ; CHECK-LABEL: abs_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [abs_float_param_0];
-; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [abs_float_param_0];
+; CHECK-NEXT:    abs.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.fabs.f32(float %a)
   ret float %b
@@ -409,12 +403,12 @@ define float @abs_float(float %a) {
 define float @abs_float_ftz(float %a) #1 {
 ; CHECK-LABEL: abs_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [abs_float_ftz_param_0];
-; CHECK-NEXT:    abs.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [abs_float_ftz_param_0];
+; CHECK-NEXT:    abs.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.fabs.f32(float %a)
   ret float %b
@@ -423,12 +417,12 @@ define float @abs_float_ftz(float %a) #1 {
 define double @abs_double(double %a) {
 ; CHECK-LABEL: abs_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [abs_double_param_0];
-; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [abs_double_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.fabs.f64(double %a)
   ret double %b
@@ -440,15 +434,15 @@ define half @minnum_half(half %a, half %b) {
 ; CHECK-NOF16-LABEL: minnum_half(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [minnum_half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs2, [minnum_half_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT:    min.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
 ; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-NOF16-NEXT:    ret;
 ;
@@ -466,15 +460,15 @@ define half @minnum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16-LABEL: minnum_half(
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<4>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [minnum_half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs2, [minnum_half_param_1];
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-SM80-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
-; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    min.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
 ; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call half @llvm.minnum.f16(half %a, half %b)
@@ -484,13 +478,13 @@ define half @minnum_half(half %a, half %b) {
 define float @minnum_float(float %a, float %b) {
 ; CHECK-LABEL: minnum_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [minnum_float_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [minnum_float_param_1];
-; CHECK-NEXT:    min.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ld.param.b32 %r1, [minnum_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [minnum_float_param_1];
+; CHECK-NEXT:    min.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float %b)
   ret float %x
@@ -499,12 +493,12 @@ define float @minnum_float(float %a, float %b) {
 define float @minnum_imm1(float %a) {
 ; CHECK-LABEL: minnum_imm1(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [minnum_imm1_param_0];
-; CHECK-NEXT:    min.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [minnum_imm1_param_0];
+; CHECK-NEXT:    min.f32 %r2, %r1, 0f00000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float 0.0)
   ret float %x
@@ -513,12 +507,12 @@ define float @minnum_imm1(float %a) {
 define float @minnum_imm2(float %a) {
 ; CHECK-LABEL: minnum_imm2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [minnum_imm2_param_0];
-; CHECK-NEXT:    min.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [minnum_imm2_param_0];
+; CHECK-NEXT:    min.f32 %r2, %r1, 0f00000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float 0.0, float %a)
   ret float %x
@@ -527,13 +521,13 @@ define float @minnum_imm2(float %a) {
 define float @minnum_float_ftz(float %a, float %b) #1 {
 ; CHECK-LABEL: minnum_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [minnum_float_ftz_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [minnum_float_ftz_param_1];
-; CHECK-NEXT:    min.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ld.param.b32 %r1, [minnum_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [minnum_float_ftz_param_1];
+; CHECK-NEXT:    min.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float %b)
   ret float %x
@@ -542,13 +536,13 @@ define float @minnum_float_ftz(float %a, float %b) #1 {
 define double @minnum_double(double %a, double %b) {
 ; CHECK-LABEL: minnum_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [minnum_double_param_0];
-; CHECK-NEXT:    ld.param.b64 %fd2, [minnum_double_param_1];
-; CHECK-NEXT:    min.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [minnum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [minnum_double_param_1];
+; CHECK-NEXT:    min.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.minnum.f64(double %a, double %b)
   ret double %x
@@ -558,24 +552,23 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16-LABEL: minnum_v2half(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [minnum_v2half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [minnum_v2half_param_1];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    min.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    min.f32 %r5, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    min.f32 %r8, %r7, %r6;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minnum_v2half(
@@ -592,24 +585,23 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16-LABEL: minnum_v2half(
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minnum_v2half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [minnum_v2half_param_1];
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-SM80-NOF16-NEXT:    min.f32 %f3, %f2, %f1;
-; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-SM80-NOF16-NEXT:    min.f32 %f6, %f5, %f4;
-; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-SM80-NOF16-NEXT:    min.f32 %r5, %r4, %r3;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-SM80-NOF16-NEXT:    min.f32 %r8, %r7, %r6;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %x
@@ -622,23 +614,23 @@ define half @minimum_half(half %a, half %b) {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<6>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<8>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [minimum_half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs2, [minimum_half_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %r2, %r1;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %r2, %r1;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs4, 0x7E00, %rs3, %p2;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, -32768;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, %rs1, %rs4, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, -32768;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, %rs2, %rs5, %p4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs4;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %r3, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
 ; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-NOF16-NEXT:    ret;
@@ -658,23 +650,23 @@ define half @minimum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<6>;
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<8>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [minimum_half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs2, [minimum_half_param_1];
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p1, %r2, %r1;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %r2, %r1;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs4, 0x7E00, %rs3, %p2;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, -32768;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, %rs1, %rs4, %p3;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, -32768;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, %rs2, %rs5, %p4;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs4;
-; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %r3, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
 ; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-SM80-NOF16-NEXT:    ret;
@@ -686,46 +678,43 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-NOF16-LABEL: minimum_float(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<12>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_param_0];
-; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_param_1];
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f2;
-; CHECK-NOF16-NEXT:    min.f32 %f3, %f1, %f2;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
-; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, -2147483648;
-; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %f4, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r10, [minimum_float_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r11, [minimum_float_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r10, %r11;
+; CHECK-NOF16-NEXT:    min.f32 %r4, %r10, %r11;
+; CHECK-NOF16-NEXT:    selp.f32 %r5, 0f7FC00000, %r4, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r10, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %r6, %r10, %r5, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r11, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %r8, %r11, %r6, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %r5, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r9, %r8, %r5, %p4;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_float(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<4>;
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_float_param_0];
-; CHECK-F16-NEXT:    ld.param.b32 %f2, [minimum_float_param_1];
-; CHECK-F16-NEXT:    min.NaN.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [minimum_float_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [minimum_float_param_1];
+; CHECK-F16-NEXT:    min.NaN.f32 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_float(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_param_1];
-; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minimum_float_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [minimum_float_param_1];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %r3, %r1, %r2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float %a, float %b)
   ret float %x
@@ -735,40 +724,38 @@ define float @minimum_imm1(float %a) {
 ; CHECK-NOF16-LABEL: minimum_imm1(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<2>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<6>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm1_param_0];
-; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; CHECK-NOF16-NEXT:    min.f32 %f2, %f1, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, %f1, %f3, %p2;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %f3, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f5, %f4, %f3, %p3;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f5;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r7, [minimum_imm1_param_0];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r7, %r7;
+; CHECK-NOF16-NEXT:    min.f32 %r3, %r7, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r7, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %r5, %r7, %r4, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %r4, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r6, %r5, %r4, %p3;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_imm1(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_imm1_param_0];
-; CHECK-F16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [minimum_imm1_param_0];
+; CHECK-F16-NEXT:    min.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_imm1(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm1_param_0];
-; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minimum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float %a, float 0.0)
   ret float %x
@@ -778,40 +765,38 @@ define float @minimum_imm2(float %a) {
 ; CHECK-NOF16-LABEL: minimum_imm2(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<2>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<6>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm2_param_0];
-; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; CHECK-NOF16-NEXT:    min.f32 %f2, %f1, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, %f1, %f3, %p2;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %f3, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f5, %f4, %f3, %p3;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f5;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r7, [minimum_imm2_param_0];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r7, %r7;
+; CHECK-NOF16-NEXT:    min.f32 %r3, %r7, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r7, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %r5, %r7, %r4, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %r4, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r6, %r5, %r4, %p3;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_imm2(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_imm2_param_0];
-; CHECK-F16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [minimum_imm2_param_0];
+; CHECK-F16-NEXT:    min.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_imm2(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm2_param_0];
-; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minimum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float 0.0, float %a)
   ret float %x
@@ -821,46 +806,43 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-LABEL: minimum_float_ftz(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<12>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_ftz_param_0];
-; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_ftz_param_1];
-; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %f1, %f2;
-; CHECK-NOF16-NEXT:    min.ftz.f32 %f3, %f1, %f2;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, -2147483648;
-; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, -2147483648;
-; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
-; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %f4, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r10, [minimum_float_ftz_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r11, [minimum_float_ftz_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %r10, %r11;
+; CHECK-NOF16-NEXT:    min.ftz.f32 %r4, %r10, %r11;
+; CHECK-NOF16-NEXT:    selp.f32 %r5, 0f7FC00000, %r4, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r10, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %r6, %r10, %r5, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r11, -2147483648;
+; CHECK-NOF16-NEXT:    selp.f32 %r8, %r11, %r6, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %r5, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r9, %r8, %r5, %p4;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_float_ftz(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<4>;
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_float_ftz_param_0];
-; CHECK-F16-NEXT:    ld.param.b32 %f2, [minimum_float_ftz_param_1];
-; CHECK-F16-NEXT:    min.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [minimum_float_ftz_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [minimum_float_ftz_param_1];
+; CHECK-F16-NEXT:    min.NaN.ftz.f32 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_float_ftz(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_ftz_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_ftz_param_1];
-; CHECK-SM80-NOF16-NEXT:    min.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minimum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [minimum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT:    min.NaN.ftz.f32 %r3, %r1, %r2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float %a, float %b)
   ret float %x
@@ -870,24 +852,21 @@ define double @minimum_double(double %a, double %b) {
 ; CHECK-LABEL: minimum_double(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<8>;
+; CHECK-NEXT:    .reg .b64 %rd<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [minimum_double_param_0];
-; CHECK-NEXT:    mov.b64 %rd1, %fd1;
-; CHECK-NEXT:    ld.param.b64 %fd2, [minimum_double_param_1];
-; CHECK-NEXT:    setp.nan.f64 %p1, %fd1, %fd2;
-; CHECK-NEXT:    min.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT:    selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
-; CHECK-NEXT:    setp.eq.s64 %p2, %rd1, -9223372036854775808;
-; CHECK-NEXT:    selp.f64 %fd5, %fd1, %fd4, %p2;
-; CHECK-NEXT:    mov.b64 %rd2, %fd2;
-; CHECK-NEXT:    setp.eq.s64 %p3, %rd2, -9223372036854775808;
-; CHECK-NEXT:    selp.f64 %fd6, %fd2, %fd5, %p3;
-; CHECK-NEXT:    setp.eq.f64 %p4, %fd4, 0d0000000000000000;
-; CHECK-NEXT:    selp.f64 %fd7, %fd6, %fd4, %p4;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd7;
+; CHECK-NEXT:    ld.param.b64 %rd10, [minimum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd11, [minimum_double_param_1];
+; CHECK-NEXT:    setp.nan.f64 %p1, %rd10, %rd11;
+; CHECK-NEXT:    min.f64 %rd4, %rd10, %rd11;
+; CHECK-NEXT:    selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; CHECK-NEXT:    setp.eq.s64 %p2, %rd10, -9223372036854775808;
+; CHECK-NEXT:    selp.f64 %rd6, %rd10, %rd5, %p2;
+; CHECK-NEXT:    setp.eq.s64 %p3, %rd11, -9223372036854775808;
+; CHECK-NEXT:    selp.f64 %rd8, %rd11, %rd6, %p3;
+; CHECK-NEXT:    setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; CHECK-NEXT:    selp.f64 %rd9, %rd8, %rd5, %p4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd9;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.minimum.f64(double %a, double %b)
   ret double %x
@@ -898,42 +877,41 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<11>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<15>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [minimum_v2half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [minimum_v2half_param_1];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %r4, %r3;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %r4, %r3;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, -32768;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs7, %rs4, %rs6, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, -32768;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs8, %rs2, %rs7, %p4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs6;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %r5, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    setp.lt.f32 %p6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p6, %r7, %r6;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs3, %rs1, %p6;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p7, %r7, %r6;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs11, 0x7E00, %rs10, %p7;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, -32768;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs11, %p8;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, -32768;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs13, %rs1, %rs12, %p9;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs11;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs11;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p10, %r8, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs14, %rs13, %rs11, %p10;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs14, %rs9};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs14, %rs9};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_v2half(
@@ -951,42 +929,41 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<11>;
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<15>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [minimum_v2half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [minimum_v2half_param_1];
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p1, %r4, %r3;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %r4, %r3;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, -32768;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs7, %rs4, %rs6, %p3;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, -32768;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs8, %rs2, %rs7, %p4;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
-; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r5, %rs6;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %r5, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p5;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-SM80-NOF16-NEXT:    setp.lt.f32 %p6, %r7, %r6;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs10, %rs3, %rs1, %p6;
-; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p7, %r7, %r6;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs11, 0x7E00, %rs10, %p7;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, -32768;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs11, %p8;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, -32768;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs13, %rs1, %rs12, %p9;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f6, %rs11;
-; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r8, %rs11;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p10, %r8, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs14, %rs13, %rs11, %p10;
-; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs14, %rs9};
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r9, {%rs14, %rs9};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %x
@@ -998,15 +975,15 @@ define half @maxnum_half(half %a, half %b) {
 ; CHECK-NOF16-LABEL: maxnum_half(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [maxnum_half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs2, [maxnum_half_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT:    max.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
 ; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-NOF16-NEXT:    ret;
 ;
@@ -1024,15 +1001,15 @@ define half @maxnum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16-LABEL: maxnum_half(
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<4>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [maxnum_half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs2, [maxnum_half_param_1];
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-SM80-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
-; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    max.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
 ; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call half @llvm.maxnum.f16(half %a, half %b)
@@ -1042,12 +1019,12 @@ define half @maxnum_half(half %a, half %b) {
 define float @maxnum_imm1(float %a) {
 ; CHECK-LABEL: maxnum_imm1(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_imm1_param_0];
-; CHECK-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [maxnum_imm1_param_0];
+; CHECK-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float 0.0)
   ret float %x
@@ -1056,12 +1033,12 @@ define float @maxnum_imm1(float %a) {
 define float @maxnum_imm2(float %a) {
 ; CHECK-LABEL: maxnum_imm2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_imm2_param_0];
-; CHECK-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [maxnum_imm2_param_0];
+; CHECK-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float 0.0, float %a)
   ret float %x
@@ -1070,13 +1047,13 @@ define float @maxnum_imm2(float %a) {
 define float @maxnum_float(float %a, float %b) {
 ; CHECK-LABEL: maxnum_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_float_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [maxnum_float_param_1];
-; CHECK-NEXT:    max.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ld.param.b32 %r1, [maxnum_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [maxnum_float_param_1];
+; CHECK-NEXT:    max.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float %b)
   ret float %x
@@ -1085,13 +1062,13 @@ define float @maxnum_float(float %a, float %b) {
 define float @maxnum_float_ftz(float %a, float %b) #1 {
 ; CHECK-LABEL: maxnum_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_float_ftz_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [maxnum_float_ftz_param_1];
-; CHECK-NEXT:    max.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ld.param.b32 %r1, [maxnum_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [maxnum_float_ftz_param_1];
+; CHECK-NEXT:    max.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float %b)
   ret float %x
@@ -1100,13 +1077,13 @@ define float @maxnum_float_ftz(float %a, float %b) #1 {
 define double @maxnum_double(double %a, double %b) {
 ; CHECK-LABEL: maxnum_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [maxnum_double_param_0];
-; CHECK-NEXT:    ld.param.b64 %fd2, [maxnum_double_param_1];
-; CHECK-NEXT:    max.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [maxnum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [maxnum_double_param_1];
+; CHECK-NEXT:    max.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.maxnum.f64(double %a, double %b)
   ret double %x
@@ -1116,24 +1093,23 @@ define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16-LABEL: maxnum_v2half(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [maxnum_v2half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [maxnum_v2half_param_1];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    max.f32 %f6, %f5, %f4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    max.f32 %r5, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    max.f32 %r8, %r7, %r6;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maxnum_v2half(
@@ -1150,24 +1126,23 @@ define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16-LABEL: maxnum_v2half(
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maxnum_v2half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [maxnum_v2half_param_1];
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-SM80-NOF16-NEXT:    max.f32 %f3, %f2, %f1;
-; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-SM80-NOF16-NEXT:    max.f32 %f6, %f5, %f4;
-; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
-; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-SM80-NOF16-NEXT:    max.f32 %r5, %r4, %r3;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-SM80-NOF16-NEXT:    max.f32 %r8, %r7, %r6;
+; CHECK-SM80-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %x
@@ -1180,23 +1155,23 @@ define half @maximum_half(half %a, half %b) {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<6>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<8>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [maximum_half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b16 %rs2, [maximum_half_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %r2, %r1;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %r2, %r1;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs4, 0x7E00, %rs3, %p2;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, 0;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, %rs1, %rs4, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, 0;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, %rs2, %rs5, %p4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs4;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %r3, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
 ; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-NOF16-NEXT:    ret;
@@ -1216,23 +1191,23 @@ define half @maximum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<6>;
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<8>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs1, [maximum_half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b16 %rs2, [maximum_half_param_1];
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p1, %r2, %r1;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %r2, %r1;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs4, 0x7E00, %rs3, %p2;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs1, 0;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, %rs1, %rs4, %p3;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, 0;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, %rs2, %rs5, %p4;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs4;
-; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %r3, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
 ; CHECK-SM80-NOF16-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-SM80-NOF16-NEXT:    ret;
@@ -1244,36 +1219,36 @@ define float @maximum_imm1(float %a) {
 ; CHECK-NOF16-LABEL: maximum_imm1(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm1_param_0];
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; CHECK-NOF16-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f3, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f00000000, %f3, %p2;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [maximum_imm1_param_0];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r3, 0f7FC00000, %r2, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %r3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r4, 0f00000000, %r3, %p2;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_imm1(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_imm1_param_0];
-; CHECK-F16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [maximum_imm1_param_0];
+; CHECK-F16-NEXT:    max.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_imm1(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm1_param_0];
-; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maximum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float %a, float 0.0)
   ret float %x
@@ -1283,36 +1258,36 @@ define float @maximum_imm2(float %a) {
 ; CHECK-NOF16-LABEL: maximum_imm2(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm2_param_0];
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
-; CHECK-NOF16-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f3, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f00000000, %f3, %p2;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [maximum_imm2_param_0];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT:    max.f32 %r2, %r1, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r3, 0f7FC00000, %r2, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %r3, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r4, 0f00000000, %r3, %p2;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_imm2(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_imm2_param_0];
-; CHECK-F16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [maximum_imm2_param_0];
+; CHECK-F16-NEXT:    max.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_imm2(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm2_param_0];
-; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maximum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %r2, %r1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float 0.0, float %a)
   ret float %x
@@ -1322,46 +1297,43 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-NOF16-LABEL: maximum_float(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<12>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_param_0];
-; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_param_1];
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f2;
-; CHECK-NOF16-NEXT:    max.f32 %f3, %f1, %f2;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, 0;
-; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, 0;
-; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %f4, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r10, [maximum_float_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r11, [maximum_float_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r10, %r11;
+; CHECK-NOF16-NEXT:    max.f32 %r4, %r10, %r11;
+; CHECK-NOF16-NEXT:    selp.f32 %r5, 0f7FC00000, %r4, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r10, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %r6, %r10, %r5, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r11, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %r8, %r11, %r6, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %r5, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r9, %r8, %r5, %p4;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_float(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<4>;
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_float_param_0];
-; CHECK-F16-NEXT:    ld.param.b32 %f2, [maximum_float_param_1];
-; CHECK-F16-NEXT:    max.NaN.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [maximum_float_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [maximum_float_param_1];
+; CHECK-F16-NEXT:    max.NaN.f32 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_float(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_param_1];
-; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maximum_float_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [maximum_float_param_1];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %r3, %r1, %r2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float %a, float %b)
   ret float %x
@@ -1371,46 +1343,43 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-LABEL: maximum_float_ftz(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<5>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<12>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_ftz_param_0];
-; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_ftz_param_1];
-; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %f1, %f2;
-; CHECK-NOF16-NEXT:    max.ftz.f32 %f3, %f1, %f2;
-; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r1, 0;
-; CHECK-NOF16-NEXT:    selp.f32 %f5, %f1, %f4, %p2;
-; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
-; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r2, 0;
-; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
-; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %f4, 0f00000000;
-; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r10, [maximum_float_ftz_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r11, [maximum_float_ftz_param_1];
+; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %r10, %r11;
+; CHECK-NOF16-NEXT:    max.ftz.f32 %r4, %r10, %r11;
+; CHECK-NOF16-NEXT:    selp.f32 %r5, 0f7FC00000, %r4, %p1;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p2, %r10, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %r6, %r10, %r5, %p2;
+; CHECK-NOF16-NEXT:    setp.eq.s32 %p3, %r11, 0;
+; CHECK-NOF16-NEXT:    selp.f32 %r8, %r11, %r6, %p3;
+; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %r5, 0f00000000;
+; CHECK-NOF16-NEXT:    selp.f32 %r9, %r8, %r5, %p4;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_float_ftz(
 ; CHECK-F16:       {
-; CHECK-F16-NEXT:    .reg .b32 %f<4>;
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_float_ftz_param_0];
-; CHECK-F16-NEXT:    ld.param.b32 %f2, [maximum_float_ftz_param_1];
-; CHECK-F16-NEXT:    max.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [maximum_float_ftz_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [maximum_float_ftz_param_1];
+; CHECK-F16-NEXT:    max.NaN.ftz.f32 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_float_ftz(
 ; CHECK-SM80-NOF16:       {
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_ftz_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_ftz_param_1];
-; CHECK-SM80-NOF16-NEXT:    max.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maximum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [maximum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT:    max.NaN.ftz.f32 %r3, %r1, %r2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float %a, float %b)
   ret float %x
@@ -1420,24 +1389,21 @@ define double @maximum_double(double %a, double %b) {
 ; CHECK-LABEL: maximum_double(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-NEXT:    .reg .b64 %fd<8>;
+; CHECK-NEXT:    .reg .b64 %rd<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [maximum_double_param_0];
-; CHECK-NEXT:    mov.b64 %rd1, %fd1;
-; CHECK-NEXT:    ld.param.b64 %fd2, [maximum_double_param_1];
-; CHECK-NEXT:    setp.nan.f64 %p1, %fd1, %fd2;
-; CHECK-NEXT:    max.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT:    selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
-; CHECK-NEXT:    setp.eq.s64 %p2, %rd1, 0;
-; CHECK-NEXT:    selp.f64 %fd5, %fd1, %fd4, %p2;
-; CHECK-NEXT:    mov.b64 %rd2, %fd2;
-; CHECK-NEXT:    setp.eq.s64 %p3, %rd2, 0;
-; CHECK-NEXT:    selp.f64 %fd6, %fd2, %fd5, %p3;
-; CHECK-NEXT:    setp.eq.f64 %p4, %fd4, 0d0000000000000000;
-; CHECK-NEXT:    selp.f64 %fd7, %fd6, %fd4, %p4;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd7;
+; CHECK-NEXT:    ld.param.b64 %rd10, [maximum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd11, [maximum_double_param_1];
+; CHECK-NEXT:    setp.nan.f64 %p1, %rd10, %rd11;
+; CHECK-NEXT:    max.f64 %rd4, %rd10, %rd11;
+; CHECK-NEXT:    selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; CHECK-NEXT:    setp.eq.s64 %p2, %rd10, 0;
+; CHECK-NEXT:    selp.f64 %rd6, %rd10, %rd5, %p2;
+; CHECK-NEXT:    setp.eq.s64 %p3, %rd11, 0;
+; CHECK-NEXT:    selp.f64 %rd8, %rd11, %rd6, %p3;
+; CHECK-NEXT:    setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; CHECK-NEXT:    selp.f64 %rd9, %rd8, %rd5, %p4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd9;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.maximum.f64(double %a, double %b)
   ret double %x
@@ -1448,42 +1414,41 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .pred %p<11>;
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<15>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [maximum_v2half_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [maximum_v2half_param_1];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %r4, %r3;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %r4, %r3;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, 0;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs7, %rs4, %rs6, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, 0;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs8, %rs2, %rs7, %p4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs6;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p5, %r5, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-NOF16-NEXT:    setp.gt.f32 %p6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p6, %r7, %r6;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs3, %rs1, %p6;
-; CHECK-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p7, %r7, %r6;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs11, 0x7E00, %rs10, %p7;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, 0;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs11, %p8;
 ; CHECK-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, 0;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs13, %rs1, %rs12, %p9;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs11;
-; CHECK-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs11;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p10, %r8, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs14, %rs13, %rs11, %p10;
-; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs14, %rs9};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs14, %rs9};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_v2half(
@@ -1501,42 +1466,41 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16:       {
 ; CHECK-SM80-NOF16-NEXT:    .reg .pred %p<11>;
 ; CHECK-SM80-NOF16-NEXT:    .reg .b16 %rs<15>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<4>;
-; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<7>;
+; CHECK-SM80-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r1, [maximum_v2half_param_0];
 ; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %r2, [maximum_v2half_param_1];
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
 ; CHECK-SM80-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
-; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
+; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p1, %r4, %r3;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p2, %r4, %r3;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs6, 0x7E00, %rs5, %p2;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p3, %rs4, 0;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs7, %rs4, %rs6, %p3;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p4, %rs2, 0;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs8, %rs2, %rs7, %p4;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
-; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r5, %rs6;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p5, %r5, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p5;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
-; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
+; CHECK-SM80-NOF16-NEXT:    setp.gt.f32 %p6, %r7, %r6;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs10, %rs3, %rs1, %p6;
-; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p7, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT:    setp.nan.f32 %p7, %r7, %r6;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs11, 0x7E00, %rs10, %p7;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p8, %rs3, 0;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs12, %rs3, %rs11, %p8;
 ; CHECK-SM80-NOF16-NEXT:    setp.eq.s16 %p9, %rs1, 0;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs13, %rs1, %rs12, %p9;
-; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %f6, %rs11;
-; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-SM80-NOF16-NEXT:    cvt.f32.f16 %r8, %rs11;
+; CHECK-SM80-NOF16-NEXT:    setp.eq.f32 %p10, %r8, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT:    selp.b16 %rs14, %rs13, %rs11, %p10;
-; CHECK-SM80-NOF16-NEXT:    mov.b32 %r3, {%rs14, %rs9};
-; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-SM80-NOF16-NEXT:    mov.b32 %r9, {%rs14, %rs9};
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %x
@@ -1547,14 +1511,14 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
 define float @fma_float(float %a, float %b, float %c) {
 ; CHECK-LABEL: fma_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fma_float_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [fma_float_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [fma_float_param_2];
-; CHECK-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NEXT:    ld.param.b32 %r1, [fma_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fma_float_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [fma_float_param_2];
+; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %x
@@ -1563,14 +1527,14 @@ define float @fma_float(float %a, float %b, float %c) {
 define float @fma_float_ftz(float %a, float %b, float %c) #1 {
 ; CHECK-LABEL: fma_float_ftz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [fma_float_ftz_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [fma_float_ftz_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [fma_float_ftz_param_2];
-; CHECK-NEXT:    fma.rn.ftz.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
+; CHECK-NEXT:    ld.param.b32 %r1, [fma_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fma_float_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [fma_float_ftz_param_2];
+; CHECK-NEXT:    fma.rn.ftz.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %x
@@ -1579,14 +1543,14 @@ define float @fma_float_ftz(float %a, float %b, float %c) #1 {
 define double @fma_double(double %a, double %b, double %c) {
 ; CHECK-LABEL: fma_double(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [fma_double_param_0];
-; CHECK-NEXT:    ld.param.b64 %fd2, [fma_double_param_1];
-; CHECK-NEXT:    ld.param.b64 %fd3, [fma_double_param_2];
-; CHECK-NEXT:    fma.rn.f64 %fd4, %fd1, %fd2, %fd3;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd4;
+; CHECK-NEXT:    ld.param.b64 %rd1, [fma_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [fma_double_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [fma_double_param_2];
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.fma.f64(double %a, double %b, double %c)
   ret double %x
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index 2e12c5041b06b..7e907990147a5 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -8,8 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-LABEL: wombat(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<11>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .b64 %fd<6>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %bb
 ; CHECK-NEXT:    ld.param.b32 %r4, [wombat_param_2];
@@ -27,16 +26,16 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.b64 %fd1, [retval0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [retval0];
 ; CHECK-NEXT:    } // callseq 0
 ; CHECK-NEXT:    mul.lo.s32 %r7, %r10, %r3;
 ; CHECK-NEXT:    or.b32 %r8, %r4, %r7;
 ; CHECK-NEXT:    mul.lo.s32 %r9, %r2, %r8;
-; CHECK-NEXT:    cvt.rn.f64.s32 %fd3, %r9;
-; CHECK-NEXT:    cvt.rn.f64.u32 %fd4, %r10;
-; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, %fd3;
-; CHECK-NEXT:    mov.b64 %rd1, 0;
-; CHECK-NEXT:    st.global.b64 [%rd1], %fd5;
+; CHECK-NEXT:    cvt.rn.f64.s32 %rd3, %r9;
+; CHECK-NEXT:    cvt.rn.f64.u32 %rd4, %r10;
+; CHECK-NEXT:    add.rn.f64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    mov.b64 %rd6, 0;
+; CHECK-NEXT:    st.global.b64 [%rd6], %rd5;
 ; CHECK-NEXT:    mov.b32 %r10, 1;
 ; CHECK-NEXT:    bra.uni $L__BB0_1;
 bb:
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index 1840de4494157..78ec183ede04d 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -14,8 +14,7 @@ declare i32 @callee(%struct.1float %a)
 define i32 @test(%struct.1float alignstack(32) %data) {
 ; CHECK-LABEL: test(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<18>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<20>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %r1, [test_param_0+1];
@@ -27,26 +26,25 @@ define i32 @test(%struct.1float alignstack(32) %data) {
 ; CHECK-NEXT:    ld.param.b8 %r7, [test_param_0+2];
 ; CHECK-NEXT:    or.b32 %r8, %r6, %r7;
 ; CHECK-NEXT:    shl.b32 %r9, %r8, 16;
-; CHECK-NEXT:    or.b32 %r17, %r9, %r4;
-; CHECK-NEXT:    mov.b32 %f1, %r17;
-; CHECK-NEXT:    shr.u32 %r12, %r17, 8;
-; CHECK-NEXT:    shr.u32 %r13, %r17, 16;
-; CHECK-NEXT:    shr.u32 %r14, %r17, 24;
+; CHECK-NEXT:    or.b32 %r19, %r9, %r4;
+; CHECK-NEXT:    shr.u32 %r13, %r19, 8;
+; CHECK-NEXT:    shr.u32 %r14, %r19, 16;
+; CHECK-NEXT:    shr.u32 %r15, %r19, 24;
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 1 .b8 param0[4];
-; CHECK-NEXT:    st.param.b8 [param0], %r17;
-; CHECK-NEXT:    st.param.b8 [param0+1], %r12;
-; CHECK-NEXT:    st.param.b8 [param0+2], %r13;
-; CHECK-NEXT:    st.param.b8 [param0+3], %r14;
+; CHECK-NEXT:    st.param.b8 [param0], %r19;
+; CHECK-NEXT:    st.param.b8 [param0+1], %r13;
+; CHECK-NEXT:    st.param.b8 [param0+2], %r14;
+; CHECK-NEXT:    st.param.b8 [param0+3], %r15;
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    call.uni (retval0),
 ; CHECK-NEXT:    callee,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.b32 %r15, [retval0];
+; CHECK-NEXT:    ld.param.b32 %r16, [retval0];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r16;
 ; CHECK-NEXT:    ret;
 
   %1 = call i32 @callee(%struct.1float %data)
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 781156082e540..ce6707c4564bf 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -701,13 +701,13 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_f32(
 ; CHECK-NEXT: .param .b32 test_f32_param_0
-; CHECK:      ld.param.b32    [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK:      ld.param.b32    [[E:%r[0-9]+]], [test_f32_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], [[E]];
 ; CHECK:      .param .b32 retval0;
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT: test_f32,
-; CHECK:      ld.param.b32    [[R:%f[0-9]+]], [retval0];
+; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0];
 ; CHECK:      st.param.b32    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define float @test_f32(float %a) {
@@ -976,13 +976,13 @@ define %s_i32 @test_s_i32(%s_i32 %a) {
 ; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
 ; CHECK-LABEL: test_s_f32(
 ; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
-; CHECK:      ld.param.b32    [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK:      ld.param.b32    [[E:%r[0-9]+]], [test_s_f32_param_0];
 ; CHECK:      .param .align 4 .b8 param0[4]
 ; CHECK:      st.param.b32    [param0], [[E]];
 ; CHECK:      .param .align 4 .b8 retval0[4];
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT: test_s_f32,
-; CHECK:      ld.param.b32    [[R:%f[0-9]+]], [retval0];
+; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0];
 ; CHECK:      st.param.b32    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define %s_f32 @test_s_f32(%s_f32 %a) {
@@ -1012,9 +1012,9 @@ define %s_i64 @test_s_i64(%s_i64 %a) {
 ; CHECK-LABEL: test_s_i32f32(
 ; CHECK:        .param .align 8 .b8 test_s_i32f32_param_0[24]
 ; CHECK-DAG:    ld.param.b64    [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
-; CHECK-DAG:    ld.param.b32    [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG:    ld.param.b32    [[E3:%r[0-9]+]], [test_s_i32f32_param_0+12];
 ; CHECK-DAG:    ld.param.b32    [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
-; CHECK-DAG:    ld.param.b32    [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG:    ld.param.b32    [[E1:%r[0-9]+]], [test_s_i32f32_param_0+4];
 ; CHECK-DAG:    ld.param.b32    [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
 ; CHECK:        .param .align 8 .b8 param0[24];
 ; CHECK-DAG:    st.param.b32    [param0], [[E0]];
@@ -1026,9 +1026,9 @@ define %s_i64 @test_s_i64(%s_i64 %a) {
 ; CHECK:        call.uni (retval0),
 ; CHECK-NEXT:   test_s_i32f32,
 ; CHECK-DAG:    ld.param.b32    [[RE0:%r[0-9]+]], [retval0];
-; CHECK-DAG:    ld.param.b32    [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG:    ld.param.b32    [[RE1:%r[0-9]+]], [retval0+4];
 ; CHECK-DAG:    ld.param.b32    [[RE2:%r[0-9]+]], [retval0+8];
-; CHECK-DAG:    ld.param.b32    [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b32    [[RE3:%r[0-9]+]], [retval0+12];
 ; CHECK-DAG:    ld.param.b64    [[RE4:%rd[0-9]+]], [retval0+16];
 ; CHECK-DAG:    st.param.b32    [func_retval0], [[RE0]];
 ; CHECK-DAG:    st.param.b32    [func_retval0+4], [[RE1]];
diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll
index e3d611865f1f0..22a648c7a9786 100644
--- a/llvm/test/CodeGen/NVPTX/param-overalign.ll
+++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll
@@ -24,20 +24,20 @@ define float @caller_md(float %a, float %b) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.b32 %f1, [caller_md_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [caller_md_param_1];
+; CHECK:         ld.param.b32 %r1, [caller_md_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [caller_md_param_1];
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.b32 [param0], {%f1, %f2};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {%r1, %r2};
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    call.uni (retval0),
 ; CHECK-NEXT:    callee_md,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.b32 %f3, [retval0];
+; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
 ; CHECK-NEXT:    }
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %s1 = insertvalue %struct.float2 poison, float %a, 0
   %s2 = insertvalue %struct.float2 %s1, float %b, 1
@@ -51,9 +51,9 @@ define float @callee_md(%struct.float2 alignstack(8) %a) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.v2.b32 {%f1, %f2}, [callee_md_param_0];
-; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK:         ld.param.v2.b32 {%r1, %r2}, [callee_md_param_0];
+; CHECK-NEXT:    add.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %v0 = extractvalue %struct.float2 %a, 0
   %v1 = extractvalue %struct.float2 %a, 1
@@ -68,20 +68,20 @@ define float @caller(float %a, float %b) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.b32 %f1, [caller_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [caller_param_1];
+; CHECK:         ld.param.b32 %r1, [caller_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [caller_param_1];
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.b32 [param0], {%f1, %f2};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {%r1, %r2};
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    call.uni (retval0),
 ; CHECK-NEXT:    callee,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.b32 %f3, [retval0];
+; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
 ; CHECK-NEXT:    }
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %s1 = insertvalue %struct.float2 poison, float %a, 0
   %s2 = insertvalue %struct.float2 %s1, float %b, 1
@@ -95,9 +95,9 @@ define float @callee(%struct.float2 alignstack(8) %a ) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.v2.b32 {%f1, %f2}, [callee_param_0];
-; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK:         ld.param.v2.b32 {%r1, %r2}, [callee_param_0];
+; CHECK-NEXT:    add.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %v0 = extractvalue %struct.float2 %a, 0
   %v1 = extractvalue %struct.float2 %a, 1
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
index c39716bef4d71..3649ef53b0881 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
@@ -107,10 +107,10 @@ declare float @callee_f32()
 define  float @check_f32() {
   ; PTX-LABEL: check_f32
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.b32 [[LD:%f[0-9]+]], [retval0];
+  ; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
-  ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%f[0-9]+]], [[LD]];
+  ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%r[0-9]+]], [[LD]];
   ; PTX-WITHOUT-DAG: st.param.b32 [func_retval0], [[PROXY]];
   ; PTX-WITH-DAG:    st.param.b32 [func_retval0], [[LD]];
 
@@ -122,10 +122,10 @@ declare double @callee_f64()
 define  double @check_f64() {
   ; PTX-LABEL: check_f64
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.b64 [[LD:%fd[0-9]+]], [retval0];
+  ; PTX-DAG: ld.param.b64 [[LD:%rd[0-9]+]], [retval0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
-  ; PTX-WITHOUT-DAG: mov.b64 [[PROXY:%fd[0-9]+]], [[LD]];
+  ; PTX-WITHOUT-DAG: mov.b64 [[PROXY:%rd[0-9]+]], [[LD]];
   ; PTX-WITHOUT-DAG: st.param.b64 [func_retval0], [[PROXY]];
   ; PTX-WITH-DAG:    st.param.b64 [func_retval0], [[LD]];
 
@@ -170,11 +170,11 @@ declare <2 x double> @callee_vec_f64()
 define  <2 x double> @check_vec_f64() {
   ; PTX-LABEL: check_vec_f64
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.v2.b64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0];
+  ; PTX-DAG: ld.param.v2.b64 {[[LD0:%rd[0-9]+]], [[LD1:%rd[0-9]+]]}, [retval0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
-  ; PTX-WITHOUT-DAG: mov.b64 [[PROXY0:%fd[0-9]+]], [[LD0]];
-  ; PTX-WITHOUT-DAG: mov.b64 [[PROXY1:%fd[0-9]+]], [[LD1]];
+  ; PTX-WITHOUT-DAG: mov.b64 [[PROXY0:%rd[0-9]+]], [[LD0]];
+  ; PTX-WITHOUT-DAG: mov.b64 [[PROXY1:%rd[0-9]+]], [[LD1]];
   ; PTX-WITHOUT-DAG: st.param.v2.b64 [func_retval0], {[[PROXY0]], [[PROXY1]]};
   ; PTX-WITH-DAG:    st.param.v2.b64 [func_retval0], {[[LD0]], [[LD1]]};
 
diff --git a/llvm/test/CodeGen/NVPTX/rcp-opt.ll b/llvm/test/CodeGen/NVPTX/rcp-opt.ll
index e0ef5baf21bfa..d8b9e36cd3cb5 100644
--- a/llvm/test/CodeGen/NVPTX/rcp-opt.ll
+++ b/llvm/test/CodeGen/NVPTX/rcp-opt.ll
@@ -9,13 +9,13 @@ target triple = "nvptx64-nvidia-cuda"
 define double @test1(double %in) {
 ; CHECK-LABEL: test1(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [test1_param_0];
-; CHECK-NEXT:    rcp.rn.f64 %fd2, %fd1;
-; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test1_param_0];
+; CHECK-NEXT:    rcp.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    neg.f64 %rd3, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %div = fdiv double 1.000000e+00, %in
   %neg = fsub double -0.000000e+00, %div
@@ -27,13 +27,13 @@ define double @test1(double %in) {
 define double @test2(double %in) {
 ; CHECK-LABEL: test2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [test2_param_0];
-; CHECK-NEXT:    rcp.rn.f64 %fd2, %fd1;
-; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test2_param_0];
+; CHECK-NEXT:    rcp.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    neg.f64 %rd3, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %div = fdiv double -1.000000e+00, %in
   ret double %div
@@ -44,13 +44,13 @@ define double @test2(double %in) {
 define double @test3(double %in) {
 ; CHECK-LABEL: test3(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [test3_param_0];
-; CHECK-NEXT:    rcp.rn.f64 %fd2, %fd1;
-; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test3_param_0];
+; CHECK-NEXT:    rcp.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    neg.f64 %rd3, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %neg = fsub double -0.000000e+00, %in
   %div = fdiv double 1.000000e+00, %neg
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 180b90ff90a7b..d5b451dad7bc3 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -115,20 +115,20 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
 define float @reduce_fadd_float(<8 x float> %in) {
 ; CHECK-LABEL: reduce_fadd_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<17>;
+; CHECK-NEXT:    .reg .b32 %r<17>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0];
-; CHECK-NEXT:    add.rn.f32 %f9, %f1, 0f00000000;
-; CHECK-NEXT:    add.rn.f32 %f10, %f9, %f2;
-; CHECK-NEXT:    add.rn.f32 %f11, %f10, %f3;
-; CHECK-NEXT:    add.rn.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    add.rn.f32 %f13, %f12, %f5;
-; CHECK-NEXT:    add.rn.f32 %f14, %f13, %f6;
-; CHECK-NEXT:    add.rn.f32 %f15, %f14, %f7;
-; CHECK-NEXT:    add.rn.f32 %f16, %f15, %f8;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f16;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0];
+; CHECK-NEXT:    add.rn.f32 %r9, %r1, 0f00000000;
+; CHECK-NEXT:    add.rn.f32 %r10, %r9, %r2;
+; CHECK-NEXT:    add.rn.f32 %r11, %r10, %r3;
+; CHECK-NEXT:    add.rn.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    add.rn.f32 %r13, %r12, %r5;
+; CHECK-NEXT:    add.rn.f32 %r14, %r13, %r6;
+; CHECK-NEXT:    add.rn.f32 %r15, %r14, %r7;
+; CHECK-NEXT:    add.rn.f32 %r16, %r15, %r8;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r16;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
   ret float %res
@@ -137,20 +137,20 @@ define float @reduce_fadd_float(<8 x float> %in) {
 define float @reduce_fadd_float_reassoc(<8 x float> %in) {
 ; CHECK-LABEL: reduce_fadd_float_reassoc(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<17>;
+; CHECK-NEXT:    .reg .b32 %r<17>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0];
-; CHECK-NEXT:    add.rn.f32 %f9, %f3, %f7;
-; CHECK-NEXT:    add.rn.f32 %f10, %f1, %f5;
-; CHECK-NEXT:    add.rn.f32 %f11, %f4, %f8;
-; CHECK-NEXT:    add.rn.f32 %f12, %f2, %f6;
-; CHECK-NEXT:    add.rn.f32 %f13, %f12, %f11;
-; CHECK-NEXT:    add.rn.f32 %f14, %f10, %f9;
-; CHECK-NEXT:    add.rn.f32 %f15, %f14, %f13;
-; CHECK-NEXT:    add.rn.f32 %f16, %f15, 0f00000000;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f16;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-NEXT:    add.rn.f32 %r9, %r3, %r7;
+; CHECK-NEXT:    add.rn.f32 %r10, %r1, %r5;
+; CHECK-NEXT:    add.rn.f32 %r11, %r4, %r8;
+; CHECK-NEXT:    add.rn.f32 %r12, %r2, %r6;
+; CHECK-NEXT:    add.rn.f32 %r13, %r12, %r11;
+; CHECK-NEXT:    add.rn.f32 %r14, %r10, %r9;
+; CHECK-NEXT:    add.rn.f32 %r15, %r14, %r13;
+; CHECK-NEXT:    add.rn.f32 %r16, %r15, 0f00000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r16;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
   ret float %res
@@ -159,20 +159,20 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
 define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<15>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT:    add.rn.f32 %f8, %f3, %f7;
-; CHECK-NEXT:    add.rn.f32 %f9, %f1, %f5;
-; CHECK-NEXT:    add.rn.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    add.rn.f32 %f11, %f2, %f6;
-; CHECK-NEXT:    add.rn.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    add.rn.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    add.rn.f32 %f14, %f13, 0f00000000;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f14;
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    add.rn.f32 %r8, %r3, %r7;
+; CHECK-NEXT:    add.rn.f32 %r9, %r1, %r5;
+; CHECK-NEXT:    add.rn.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    add.rn.f32 %r11, %r2, %r6;
+; CHECK-NEXT:    add.rn.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    add.rn.f32 %r13, %r10, %r12;
+; CHECK-NEXT:    add.rn.f32 %r14, %r13, 0f00000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in)
   ret float %res
@@ -274,19 +274,19 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
 define float @reduce_fmul_float(<8 x float> %in) {
 ; CHECK-LABEL: reduce_fmul_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0];
-; CHECK-NEXT:    mul.rn.f32 %f9, %f1, %f2;
-; CHECK-NEXT:    mul.rn.f32 %f10, %f9, %f3;
-; CHECK-NEXT:    mul.rn.f32 %f11, %f10, %f4;
-; CHECK-NEXT:    mul.rn.f32 %f12, %f11, %f5;
-; CHECK-NEXT:    mul.rn.f32 %f13, %f12, %f6;
-; CHECK-NEXT:    mul.rn.f32 %f14, %f13, %f7;
-; CHECK-NEXT:    mul.rn.f32 %f15, %f14, %f8;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0];
+; CHECK-NEXT:    mul.rn.f32 %r9, %r1, %r2;
+; CHECK-NEXT:    mul.rn.f32 %r10, %r9, %r3;
+; CHECK-NEXT:    mul.rn.f32 %r11, %r10, %r4;
+; CHECK-NEXT:    mul.rn.f32 %r12, %r11, %r5;
+; CHECK-NEXT:    mul.rn.f32 %r13, %r12, %r6;
+; CHECK-NEXT:    mul.rn.f32 %r14, %r13, %r7;
+; CHECK-NEXT:    mul.rn.f32 %r15, %r14, %r8;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
   ret float %res
@@ -295,19 +295,19 @@ define float @reduce_fmul_float(<8 x float> %in) {
 define float @reduce_fmul_float_reassoc(<8 x float> %in) {
 ; CHECK-LABEL: reduce_fmul_float_reassoc(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0];
-; CHECK-NEXT:    mul.rn.f32 %f9, %f3, %f7;
-; CHECK-NEXT:    mul.rn.f32 %f10, %f1, %f5;
-; CHECK-NEXT:    mul.rn.f32 %f11, %f4, %f8;
-; CHECK-NEXT:    mul.rn.f32 %f12, %f2, %f6;
-; CHECK-NEXT:    mul.rn.f32 %f13, %f12, %f11;
-; CHECK-NEXT:    mul.rn.f32 %f14, %f10, %f9;
-; CHECK-NEXT:    mul.rn.f32 %f15, %f14, %f13;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-NEXT:    mul.rn.f32 %r9, %r3, %r7;
+; CHECK-NEXT:    mul.rn.f32 %r10, %r1, %r5;
+; CHECK-NEXT:    mul.rn.f32 %r11, %r4, %r8;
+; CHECK-NEXT:    mul.rn.f32 %r12, %r2, %r6;
+; CHECK-NEXT:    mul.rn.f32 %r13, %r12, %r11;
+; CHECK-NEXT:    mul.rn.f32 %r14, %r10, %r9;
+; CHECK-NEXT:    mul.rn.f32 %r15, %r14, %r13;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
   ret float %res
@@ -316,19 +316,19 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
 define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<14>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT:    mul.rn.f32 %f8, %f3, %f7;
-; CHECK-NEXT:    mul.rn.f32 %f9, %f1, %f5;
-; CHECK-NEXT:    mul.rn.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    mul.rn.f32 %f11, %f2, %f6;
-; CHECK-NEXT:    mul.rn.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    mul.rn.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    mul.rn.f32 %r8, %r3, %r7;
+; CHECK-NEXT:    mul.rn.f32 %r9, %r1, %r5;
+; CHECK-NEXT:    mul.rn.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    mul.rn.f32 %r11, %r2, %r6;
+; CHECK-NEXT:    mul.rn.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    mul.rn.f32 %r13, %r10, %r12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
   ret float %res
@@ -404,19 +404,19 @@ define float @reduce_fmax_float(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmax_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0];
-; CHECK-NEXT:    max.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    max.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    max.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    max.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    max.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    max.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    max.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
+; CHECK-NEXT:    max.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    max.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    max.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    max.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    max.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    max.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    max.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmax(<8 x float> %in)
   ret float %res
@@ -426,19 +426,19 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmax_float_reassoc(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0];
-; CHECK-NEXT:    max.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    max.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    max.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    max.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    max.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    max.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    max.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-NEXT:    max.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    max.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    max.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    max.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    max.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    max.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    max.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in)
   ret float %res
@@ -448,19 +448,19 @@ define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<14>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT:    max.f32 %f8, %f3, %f7;
-; CHECK-NEXT:    max.f32 %f9, %f1, %f5;
-; CHECK-NEXT:    max.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    max.f32 %f11, %f2, %f6;
-; CHECK-NEXT:    max.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    max.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    max.f32 %r8, %r3, %r7;
+; CHECK-NEXT:    max.f32 %r9, %r1, %r5;
+; CHECK-NEXT:    max.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    max.f32 %r11, %r2, %r6;
+; CHECK-NEXT:    max.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    max.f32 %r13, %r10, %r12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in)
   ret float %res
@@ -536,19 +536,19 @@ define float @reduce_fmin_float(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmin_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_param_0];
-; CHECK-NEXT:    min.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    min.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    min.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    min.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    min.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    min.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    min.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
+; CHECK-NEXT:    min.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    min.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    min.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    min.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    min.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    min.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    min.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmin(<8 x float> %in)
   ret float %res
@@ -558,19 +558,19 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmin_float_reassoc(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0];
-; CHECK-NEXT:    min.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    min.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    min.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    min.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    min.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    min.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    min.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
+; CHECK-NEXT:    min.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    min.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    min.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    min.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    min.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    min.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    min.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in)
   ret float %res
@@ -580,19 +580,19 @@ define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmin_float_reassoc_nonpow2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<14>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT:    min.f32 %f8, %f3, %f7;
-; CHECK-NEXT:    min.f32 %f9, %f1, %f5;
-; CHECK-NEXT:    min.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    min.f32 %f11, %f2, %f6;
-; CHECK-NEXT:    min.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    min.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    min.f32 %r8, %r3, %r7;
+; CHECK-NEXT:    min.f32 %r9, %r1, %r5;
+; CHECK-NEXT:    min.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    min.f32 %r11, %r2, %r6;
+; CHECK-NEXT:    min.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    min.f32 %r13, %r10, %r12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in)
   ret float %res
@@ -668,19 +668,19 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmaximum_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0];
-; CHECK-NEXT:    max.NaN.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    max.NaN.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    max.NaN.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    max.NaN.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    max.NaN.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    max.NaN.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    max.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
+; CHECK-NEXT:    max.NaN.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    max.NaN.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    max.NaN.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    max.NaN.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    max.NaN.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    max.NaN.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    max.NaN.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in)
   ret float %res
@@ -690,19 +690,19 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmaximum_float_reassoc(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0];
-; CHECK-NEXT:    max.NaN.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    max.NaN.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    max.NaN.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    max.NaN.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    max.NaN.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    max.NaN.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    max.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
+; CHECK-NEXT:    max.NaN.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    max.NaN.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    max.NaN.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    max.NaN.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    max.NaN.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    max.NaN.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    max.NaN.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in)
   ret float %res
@@ -712,19 +712,19 @@ define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<14>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT:    max.NaN.f32 %f8, %f3, %f7;
-; CHECK-NEXT:    max.NaN.f32 %f9, %f1, %f5;
-; CHECK-NEXT:    max.NaN.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    max.NaN.f32 %f11, %f2, %f6;
-; CHECK-NEXT:    max.NaN.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    max.NaN.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    max.NaN.f32 %r8, %r3, %r7;
+; CHECK-NEXT:    max.NaN.f32 %r9, %r1, %r5;
+; CHECK-NEXT:    max.NaN.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    max.NaN.f32 %r11, %r2, %r6;
+; CHECK-NEXT:    max.NaN.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    max.NaN.f32 %r13, %r10, %r12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in)
   ret float %res
@@ -800,19 +800,19 @@ define float @reduce_fminimum_float(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fminimum_float(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0];
-; CHECK-NEXT:    min.NaN.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    min.NaN.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    min.NaN.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    min.NaN.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    min.NaN.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    min.NaN.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    min.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
+; CHECK-NEXT:    min.NaN.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    min.NaN.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    min.NaN.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    min.NaN.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    min.NaN.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    min.NaN.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    min.NaN.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fminimum(<8 x float> %in)
   ret float %res
@@ -822,19 +822,19 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fminimum_float_reassoc(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<16>;
+; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0];
-; CHECK-NEXT:    min.NaN.f32 %f9, %f4, %f8;
-; CHECK-NEXT:    min.NaN.f32 %f10, %f2, %f6;
-; CHECK-NEXT:    min.NaN.f32 %f11, %f10, %f9;
-; CHECK-NEXT:    min.NaN.f32 %f12, %f3, %f7;
-; CHECK-NEXT:    min.NaN.f32 %f13, %f1, %f5;
-; CHECK-NEXT:    min.NaN.f32 %f14, %f13, %f12;
-; CHECK-NEXT:    min.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
+; CHECK-NEXT:    min.NaN.f32 %r9, %r4, %r8;
+; CHECK-NEXT:    min.NaN.f32 %r10, %r2, %r6;
+; CHECK-NEXT:    min.NaN.f32 %r11, %r10, %r9;
+; CHECK-NEXT:    min.NaN.f32 %r12, %r3, %r7;
+; CHECK-NEXT:    min.NaN.f32 %r13, %r1, %r5;
+; CHECK-NEXT:    min.NaN.f32 %r14, %r13, %r12;
+; CHECK-NEXT:    min.NaN.f32 %r15, %r14, %r11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in)
   ret float %res
@@ -844,19 +844,19 @@ define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
 ;
 ; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<14>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT:    min.NaN.f32 %f8, %f3, %f7;
-; CHECK-NEXT:    min.NaN.f32 %f9, %f1, %f5;
-; CHECK-NEXT:    min.NaN.f32 %f10, %f9, %f8;
-; CHECK-NEXT:    min.NaN.f32 %f11, %f2, %f6;
-; CHECK-NEXT:    min.NaN.f32 %f12, %f11, %f4;
-; CHECK-NEXT:    min.NaN.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    min.NaN.f32 %r8, %r3, %r7;
+; CHECK-NEXT:    min.NaN.f32 %r9, %r1, %r5;
+; CHECK-NEXT:    min.NaN.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    min.NaN.f32 %r11, %r2, %r6;
+; CHECK-NEXT:    min.NaN.f32 %r12, %r11, %r4;
+; CHECK-NEXT:    min.NaN.f32 %r13, %r10, %r12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
   ret float %res
diff --git a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll
index dbc10757dc43b..7c9487b33854b 100644
--- a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll
+++ b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll
@@ -6,14 +6,13 @@ declare float @llvm.nvvm.redux.sync.fmin(float, i32)
 define float @redux_sync_fmin(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmin(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_param_1];
-; CHECK-NEXT:    redux.sync.min.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmin_param_1];
+; CHECK-NEXT:    redux.sync.min.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin(float %src, i32 %mask)
   ret float %val
@@ -23,14 +22,13 @@ declare float @llvm.nvvm.redux.sync.fmin.abs(float, i32)
 define float @redux_sync_fmin_abs(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmin_abs(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_abs_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_abs_param_1];
-; CHECK-NEXT:    redux.sync.min.abs.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_abs_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmin_abs_param_1];
+; CHECK-NEXT:    redux.sync.min.abs.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin.abs(float %src, i32 %mask)
   ret float %val
@@ -40,14 +38,13 @@ declare float @llvm.nvvm.redux.sync.fmin.NaN(float, i32)
 define float @redux_sync_fmin_NaN(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmin_NaN(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_NaN_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_NaN_param_1];
-; CHECK-NEXT:    redux.sync.min.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmin_NaN_param_1];
+; CHECK-NEXT:    redux.sync.min.NaN.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin.NaN(float %src, i32 %mask)
   ret float %val
@@ -57,14 +54,13 @@ declare float @llvm.nvvm.redux.sync.fmin.abs.NaN(float, i32)
 define float @redux_sync_fmin_abs_NaN(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmin_abs_NaN(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_abs_NaN_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_abs_NaN_param_1];
-; CHECK-NEXT:    redux.sync.min.abs.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_abs_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmin_abs_NaN_param_1];
+; CHECK-NEXT:    redux.sync.min.abs.NaN.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin.abs.NaN(float %src, i32 %mask)
   ret float %val
@@ -74,14 +70,13 @@ declare float @llvm.nvvm.redux.sync.fmax(float, i32)
 define float @redux_sync_fmax(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmax(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_param_1];
-; CHECK-NEXT:    redux.sync.max.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmax_param_1];
+; CHECK-NEXT:    redux.sync.max.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax(float %src, i32 %mask)
   ret float %val
@@ -91,14 +86,13 @@ declare float @llvm.nvvm.redux.sync.fmax.abs(float, i32)
 define float @redux_sync_fmax_abs(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmax_abs(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_abs_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_abs_param_1];
-; CHECK-NEXT:    redux.sync.max.abs.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_abs_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmax_abs_param_1];
+; CHECK-NEXT:    redux.sync.max.abs.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax.abs(float %src, i32 %mask)
   ret float %val
@@ -108,14 +102,13 @@ declare float @llvm.nvvm.redux.sync.fmax.NaN(float, i32)
 define float @redux_sync_fmax_NaN(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmax_NaN(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_NaN_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_NaN_param_1];
-; CHECK-NEXT:    redux.sync.max.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmax_NaN_param_1];
+; CHECK-NEXT:    redux.sync.max.NaN.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax.NaN(float %src, i32 %mask)
   ret float %val
@@ -125,14 +118,13 @@ declare float @llvm.nvvm.redux.sync.fmax.abs.NaN(float, i32)
 define float @redux_sync_fmax_abs_NaN(float %src, i32 %mask) {
 ; CHECK-LABEL: redux_sync_fmax_abs_NaN(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_abs_NaN_param_0];
-; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_abs_NaN_param_1];
-; CHECK-NEXT:    redux.sync.max.abs.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_abs_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [redux_sync_fmax_abs_NaN_param_1];
+; CHECK-NEXT:    redux.sync.max.abs.NaN.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax.abs.NaN(float %src, i32 %mask)
   ret float %val
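
One detail worth spelling out in the redux-sync-f32.ll updates: each
function previously declared two .b32 register banks, one integer register
for the mask (%r<2>) and two float registers (%f<3>). These fold into a
single bank of three untyped registers, hence `.reg .b32 %r<4>` (the bound
is one past the highest register index: 1 mask + 2 float values = %r1..%r3).
A small illustrative sketch (hypothetical function name; the intrinsic
signature is the one declared in the test):

  declare float @llvm.nvvm.redux.sync.fmax(float, i32)

  define float @warp_max(float %v) {
    ; With this patch, %v and the result both live in untyped .b32 %r
    ; registers; mask -1 (0xffffffff) selects every lane in the warp.
    %m = call float @llvm.nvvm.redux.sync.fmax(float %v, i32 -1)
    ret float %m
  }
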
diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll
index 4b50bca7f8efe..fb065e1b01bbe 100644
--- a/llvm/test/CodeGen/NVPTX/reg-types.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-types.ll
@@ -25,9 +25,7 @@ entry:
   %u64 = alloca i64, align 8
 ; CHECK-DAG: .reg .b64 %rd<
   %f32 = alloca float, align 4
-; CHECK-DAG: .reg .b32 %f<
   %f64 = alloca double, align 8
-; CHECK-DAG: .reg .b64 %fd<
 
 ; Verify that we use correct register types.
   store i8 1, ptr %s8, align 1
@@ -61,9 +59,9 @@ entry:
   store double 1.000000e+01, ptr %f64, align 8
 ; Instead, we force a load into a register and then verify register type.
   %f32v = load volatile float, ptr %f32, align 4
-; CHECK: ld.volatile.b32         %f{{[0-9]+}}
+; CHECK: ld.volatile.b32         %r{{[0-9]+}}
   %f64v = load volatile double, ptr %f64, align 8
-; CHECK: ld.volatile.b64         %fd{{[0-9]+}}
+; CHECK: ld.volatile.b64         %rd{{[0-9]+}}
   ret void
 ; CHECK: ret;
 ; NO8BIT: ret;
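
reg-types.ll is the most direct check of the register-class removal: the
`%f<` and `%fd<` declaration checks are simply deleted, because float and
double values now land in the %r/%rd banks already declared for the integer
values. A minimal sketch mirroring the test's volatile-load trick
(illustrative only):

  define float @load_f32(ptr %p) {
    ; The volatile load forces the value into a register; with this patch
    ; it is matched as ld.volatile.b32 %r1, [%rd1]; instead of the old %f1.
    %v = load volatile float, ptr %p, align 4
    ret float %v
  }
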
diff --git a/llvm/test/CodeGen/NVPTX/shfl-p.ll b/llvm/test/CodeGen/NVPTX/shfl-p.ll
index 756998196fdec..678fde8658664 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-p.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-p.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
 
@@ -10,164 +11,308 @@ declare {float, i1} @llvm.nvvm.shfl.bfly.f32p(float, i32, i32)
 declare {i32, i1} @llvm.nvvm.shfl.idx.i32p(i32, i32, i32)
 declare {float, i1} @llvm.nvvm.shfl.idx.f32p(float, i32, i32)
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_rrr
 define {i32, i1} @shfl_i32_rrr(i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_rrr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_rrr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_i32_rrr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [shfl_i32_rrr_param_2];
+; CHECK-NEXT:    shfl.down.b32 %r4|%p1, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_irr
 define {i32, i1} @shfl_i32_irr(i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_irr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_irr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_i32_irr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [shfl_i32_irr_param_2];
+; CHECK-NEXT:    shfl.down.b32 %r4|%p1, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_rri
 define {i32, i1} @shfl_i32_rri(i32 %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_rri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_rri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_i32_rri_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, %r2, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 1)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_iri
 define {i32, i1} @shfl_i32_iri(i32 %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_iri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_iri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_i32_iri_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, %r2, 2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 2)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_rir
 define {i32, i1} @shfl_i32_rir(i32 %a, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_rir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_rir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_i32_rir_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, 1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 %c)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_iir
 define {i32, i1} @shfl_i32_iir(i32 %a, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_iir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_iir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_i32_iir_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, 2, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 %c)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_rii
 define {i32, i1} @shfl_i32_rii(i32 %a) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_rii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_rii_param_0];
+; CHECK-NEXT:    shfl.down.b32 %r2|%p1, %r1, 1, 2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 2)
   ret {i32, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_i32_iii
 define {i32, i1} @shfl_i32_iii(i32 %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_i32_iii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_i32_iii_param_0];
+; CHECK-NEXT:    shfl.down.b32 %r2|%p1, %r1, 2, 3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 3)
   ret {i32, i1} %val
 }
 
 ;; Same intrinsics, but for float
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_rrr
 define {float, i1} @shfl_f32_rrr(float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_rrr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_rrr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_f32_rrr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [shfl_f32_rrr_param_2];
+; CHECK-NEXT:    shfl.down.b32 %r4|%p1, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_irr
 define {float, i1} @shfl_f32_irr(float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_irr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_irr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_f32_irr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [shfl_f32_irr_param_2];
+; CHECK-NEXT:    shfl.down.b32 %r4|%p1, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_rri
 define {float, i1} @shfl_f32_rri(float %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_rri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_rri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_f32_rri_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, %r2, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 1)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_iri
 define {float, i1} @shfl_f32_iri(float %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_iri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_iri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_f32_iri_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, %r2, 2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 2)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_rir
 define {float, i1} @shfl_f32_rir(float %a, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_rir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_rir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_f32_rir_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, 1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 %c)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_iir
 define {float, i1} @shfl_f32_iir(float %a, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_iir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_iir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [shfl_f32_iir_param_1];
+; CHECK-NEXT:    shfl.down.b32 %r3|%p1, %r1, 2, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 %c)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_rii
 define {float, i1} @shfl_f32_rii(float %a) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_rii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_rii_param_0];
+; CHECK-NEXT:    shfl.down.b32 %r2|%p1, %r1, 1, 2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 2)
   ret {float, i1} %val
 }
 
-; CHECK-LABEL: .func{{.*}}shfl_f32_iii
 define {float, i1} @shfl_f32_iii(float %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
-  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+; CHECK-LABEL: shfl_f32_iii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [shfl_f32_iii_param_0];
+; CHECK-NEXT:    shfl.down.b32 %r2|%p1, %r1, 2, 3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs1;
+; CHECK-NEXT:    ret;
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 3)
   ret {float, i1} %val
 }
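
After regeneration, the float shfl variants above produce CHECK bodies that
are identical to their i32 counterparts apart from parameter names: once
float values occupy the untyped .b32 %r class, shfl.down.b32 no longer
distinguishes the two element types. A brief sketch (hypothetical function;
both intrinsics are declared in the sibling shfl.ll test) showing the
convergent lowering:

  declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
  declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)

  define void @same_lowering(i32 %i, float %f, ptr %pi, ptr %pf) {
    ; Both calls are expected to lower to the same form:
    ;   shfl.down.b32 %rN, %rM, 1, 31;
    ; (register numbers depend on allocation order).
    %si = call i32 @llvm.nvvm.shfl.down.i32(i32 %i, i32 1, i32 31)
    %sf = call float @llvm.nvvm.shfl.down.f32(float %f, i32 1, i32 31)
    store i32 %si, ptr %pi
    store float %sf, ptr %pf
    ret void
  }
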
diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
index 74890dc4fed20..9cf3a1dc107c1 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
@@ -99,10 +99,10 @@ define {i32, i1} @shfl_sync_i32_iii(i32 %a, i32 %b) {
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rrr
 define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) {
   ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c)
   ret {float, i1} %val
@@ -110,10 +110,10 @@ define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_irr
 define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c)
   ret {float, i1} %val
@@ -122,9 +122,9 @@ define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rri
 define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) {
   ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1)
   ret {float, i1} %val
@@ -132,9 +132,9 @@ define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iri
 define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2)
   ret {float, i1} %val
@@ -143,9 +143,9 @@ define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) {
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rir
 define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) {
   ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c)
   ret {float, i1} %val
@@ -153,9 +153,9 @@ define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iir
 define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c)
   ret {float, i1} %val
@@ -164,8 +164,8 @@ define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) {
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rii
 define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) {
   ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2)
   ret {float, i1} %val
@@ -173,8 +173,8 @@ define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iii
 define {float, i1} @shfl_sync_f32_iii(float %a, i32 %b) {
-  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
-  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3)
   ret {float, i1} %val
diff --git a/llvm/test/CodeGen/NVPTX/shfl.ll b/llvm/test/CodeGen/NVPTX/shfl.ll
index 8aedba26b56bc..de73e2b0f8d49 100644
--- a/llvm/test/CodeGen/NVPTX/shfl.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl.ll
@@ -53,8 +53,8 @@ define i32 @shfl_down4(i32 %in, i32 %width, i32 %mask) {
 ; Try shfl.down with floating-point params.
 ; CHECK-LABEL: .func{{.*}}shfl_down_float
 define float @shfl_down_float(float %in) {
-  ; CHECK: ld.param.b32 [[IN:%f[0-9]+]]
-  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]], [[IN]], 5, 6;
+  ; CHECK: ld.param.b32 [[IN:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]], [[IN]], 5, 6;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %out = call float @llvm.nvvm.shfl.down.f32(float %in, i32 5, i32 6)
   ret float %out
@@ -67,7 +67,7 @@ define void @shfl_rest(i32 %in_i32, float %in_float, ptr %out_i32, ptr %out_floa
   %up_i32 = call i32 @llvm.nvvm.shfl.up.i32(i32 %in_i32, i32 1, i32 2)
   store i32 %up_i32, ptr %out_i32
 
-  ; CHECK: shfl.up.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 3, 4;
+  ; CHECK: shfl.up.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 3, 4;
   %up_float = call float @llvm.nvvm.shfl.up.f32(float %in_float, i32 3, i32 4)
   store float %up_float, ptr %out_float
 
@@ -75,7 +75,7 @@ define void @shfl_rest(i32 %in_i32, float %in_float, ptr %out_i32, ptr %out_floa
   %bfly_i32 = call i32 @llvm.nvvm.shfl.bfly.i32(i32 %in_i32, i32 5, i32 6)
   store i32 %bfly_i32, ptr %out_i32
 
-  ; CHECK: shfl.bfly.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 7, 8;
+  ; CHECK: shfl.bfly.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 7, 8;
   %bfly_float = call float @llvm.nvvm.shfl.bfly.f32(float %in_float, i32 7, i32 8)
   store float %bfly_float, ptr %out_float
 
@@ -83,7 +83,7 @@ define void @shfl_rest(i32 %in_i32, float %in_float, ptr %out_i32, ptr %out_floa
   %idx_i32 = call i32 @llvm.nvvm.shfl.idx.i32(i32 %in_i32, i32 9, i32 10)
   store i32 %idx_i32, ptr %out_i32
 
-  ; CHECK: shfl.idx.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 11, 12;
+  ; CHECK: shfl.idx.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 11, 12;
   %idx_float = call float @llvm.nvvm.shfl.idx.f32(float %in_float, i32 11, i32 12)
   store float %idx_float, ptr %out_float
 
diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
index d2b3f2b61ffb5..1e0e75a041c14 100644
--- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
@@ -112,24 +112,24 @@ define void @st_local_i64(ptr addrspace(5) %ptr, i64 %a) {
 ;; f32
 ; ALL-LABEL: st_global_f32
 define void @st_global_f32(ptr addrspace(1) %ptr, float %a) {
-; G32: st.global.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
-; G64: st.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; G32: st.global.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; G64: st.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; ALL: ret
   store float %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_f32
 define void @st_shared_f32(ptr addrspace(3) %ptr, float %a) {
-; LS32: st.shared.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
-; LS64: st.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; LS32: st.shared.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; LS64: st.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; ALL: ret
   store float %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_f32
 define void @st_local_f32(ptr addrspace(5) %ptr, float %a) {
-; LS32: st.local.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
-; LS64: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; LS32: st.local.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; LS64: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; ALL: ret
   store float %a, ptr addrspace(5) %ptr
   ret void
@@ -138,24 +138,24 @@ define void @st_local_f32(ptr addrspace(5) %ptr, float %a) {
 ;; f64
 ; ALL-LABEL: st_global_f64
 define void @st_global_f64(ptr addrspace(1) %ptr, double %a) {
-; G32: st.global.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
-; G64: st.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; G32: st.global.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; G64: st.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; ALL: ret
   store double %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_f64
 define void @st_shared_f64(ptr addrspace(3) %ptr, double %a) {
-; LS32: st.shared.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
-; LS64: st.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; LS32: st.shared.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; LS64: st.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; ALL: ret
   store double %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_f64
 define void @st_local_f64(ptr addrspace(5) %ptr, double %a) {
-; LS32: st.local.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
-; LS64: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; LS32: st.local.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; LS64: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; ALL: ret
   store double %a, ptr addrspace(5) %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll
index cdf9dba825518..950da93f95217 100644
--- a/llvm/test/CodeGen/NVPTX/st-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/st-generic.ll
@@ -50,9 +50,9 @@ define void @st_global_i64(ptr addrspace(0) %ptr, i64 %a) {
 ;; f32
 
 define void @st_global_f32(ptr addrspace(0) %ptr, float %a) {
-; PTX32: st.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; PTX32: st.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store float %a, ptr addrspace(0) %ptr
   ret void
@@ -61,9 +61,9 @@ define void @st_global_f32(ptr addrspace(0) %ptr, float %a) {
 ;; f64
 
 define void @st_global_f64(ptr addrspace(0) %ptr, double %a) {
-; PTX32: st.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
+; PTX32: st.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; PTX64: st.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store double %a, ptr addrspace(0) %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 5f1ea5d7b1e26..bdab9958fe2b2 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -403,13 +403,13 @@ define void @st_param_v2_f32_ii(float %val) {
 define void @st_param_v2_f32_ir(float %val) {
 ; CHECK-LABEL: st_param_v2_f32_ir(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v2_f32_ir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v2_f32_ir_param_0];
 ; CHECK-NEXT:    { // callseq 18, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.b32 [param0], {0f3F800000, %f1};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {0f3F800000, %r1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f32,
 ; CHECK-NEXT:    (
@@ -425,13 +425,13 @@ define void @st_param_v2_f32_ir(float %val) {
 define void @st_param_v2_f32_ri(float %val) {
 ; CHECK-LABEL: st_param_v2_f32_ri(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v2_f32_ri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v2_f32_ri_param_0];
 ; CHECK-NEXT:    { // callseq 19, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.b32 [param0], {%f1, 0f40000000};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {%r1, 0f40000000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f32,
 ; CHECK-NEXT:    (
@@ -467,13 +467,13 @@ define void @st_param_v2_f64_ii(double %val) {
 define void @st_param_v2_f64_ir(double %val) {
 ; CHECK-LABEL: st_param_v2_f64_ir(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [st_param_v2_f64_ir_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [st_param_v2_f64_ir_param_0];
 ; CHECK-NEXT:    { // callseq 21, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v2.b64 [param0], {0d3FF0000000000000, %fd1};
+; CHECK-NEXT:    st.param.v2.b64 [param0], {0d3FF0000000000000, %rd1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f64,
 ; CHECK-NEXT:    (
@@ -489,13 +489,13 @@ define void @st_param_v2_f64_ir(double %val) {
 define void @st_param_v2_f64_ri(double %val) {
 ; CHECK-LABEL: st_param_v2_f64_ri(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %fd<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %fd1, [st_param_v2_f64_ri_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [st_param_v2_f64_ri_param_0];
 ; CHECK-NEXT:    { // callseq 22, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v2.b64 [param0], {%fd1, 0d4000000000000000};
+; CHECK-NEXT:    st.param.v2.b64 [param0], {%rd1, 0d4000000000000000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f64,
 ; CHECK-NEXT:    (
@@ -1648,15 +1648,15 @@ define void @st_param_v4_f32_iiii() {
 define void @st_param_v4_f32_irrr(float %b, float %c, float %d) {
 ; CHECK-LABEL: st_param_v4_f32_irrr(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irrr_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_irrr_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_irrr_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_irrr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_irrr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_f32_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 69, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, %f2, %f3};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %r1, %r2, %r3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1674,15 +1674,15 @@ define void @st_param_v4_f32_irrr(float %b, float %c, float %d) {
 define void @st_param_v4_f32_rirr(float %a, float %c, float %d) {
 ; CHECK-LABEL: st_param_v4_f32_rirr(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rirr_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rirr_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_rirr_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_rirr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_rirr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_f32_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 70, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, %f2, %f3};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 0f40000000, %r2, %r3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1700,15 +1700,15 @@ define void @st_param_v4_f32_rirr(float %a, float %c, float %d) {
 define void @st_param_v4_f32_rrir(float %a, float %b, float %d) {
 ; CHECK-LABEL: st_param_v4_f32_rrir(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rrir_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rrir_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_rrir_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_rrir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_rrir_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_f32_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 71, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, %f2, 0f40400000, %f3};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, %r2, 0f40400000, %r3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1726,15 +1726,15 @@ define void @st_param_v4_f32_rrir(float %a, float %b, float %d) {
 define void @st_param_v4_f32_rrri(float %a, float %b, float %c) {
 ; CHECK-LABEL: st_param_v4_f32_rrri(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rrri_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rrri_param_1];
-; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_rrri_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_rrri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_rrri_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_f32_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 72, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, %f2, %f3, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, %r2, %r3, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1752,14 +1752,14 @@ define void @st_param_v4_f32_rrri(float %a, float %b, float %c) {
 define void @st_param_v4_f32_iirr(float %c, float %d) {
 ; CHECK-LABEL: st_param_v4_f32_iirr(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_iirr_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_iirr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_iirr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 73, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %f1, %f2};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %r1, %r2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1777,14 +1777,14 @@ define void @st_param_v4_f32_iirr(float %c, float %d) {
 define void @st_param_v4_f32_irir(float %b, float %d) {
 ; CHECK-LABEL: st_param_v4_f32_irir(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irir_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_irir_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_irir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_irir_param_1];
 ; CHECK-NEXT:    { // callseq 74, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, 0f40400000, %f2};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %r1, 0f40400000, %r2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1802,14 +1802,14 @@ define void @st_param_v4_f32_irir(float %b, float %d) {
 define void @st_param_v4_f32_irri(float %b, float %c) {
 ; CHECK-LABEL: st_param_v4_f32_irri(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irri_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_irri_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_irri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_irri_param_1];
 ; CHECK-NEXT:    { // callseq 75, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, %f2, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %r1, %r2, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1827,14 +1827,14 @@ define void @st_param_v4_f32_irri(float %b, float %c) {
 define void @st_param_v4_f32_riir(float %a, float %d) {
 ; CHECK-LABEL: st_param_v4_f32_riir(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_riir_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_riir_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_riir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_riir_param_1];
 ; CHECK-NEXT:    { // callseq 76, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, 0f40400000, %f2};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 0f40000000, 0f40400000, %r2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1852,14 +1852,14 @@ define void @st_param_v4_f32_riir(float %a, float %d) {
 define void @st_param_v4_f32_riri(float %a, float %c) {
 ; CHECK-LABEL: st_param_v4_f32_riri(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_riri_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_riri_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_riri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_riri_param_1];
 ; CHECK-NEXT:    { // callseq 77, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, %f2, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 0f40000000, %r2, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1877,14 +1877,14 @@ define void @st_param_v4_f32_riri(float %a, float %c) {
 define void @st_param_v4_f32_rrii(float %a, float %b) {
 ; CHECK-LABEL: st_param_v4_f32_rrii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rrii_param_0];
-; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rrii_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_rrii_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_f32_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 78, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, %f2, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, %r2, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1902,13 +1902,13 @@ define void @st_param_v4_f32_rrii(float %a, float %b) {
 define void @st_param_v4_f32_iiir(float %d) {
 ; CHECK-LABEL: st_param_v4_f32_iiir(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_iiir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 79, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, %f1};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, %r1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1926,13 +1926,13 @@ define void @st_param_v4_f32_iiir(float %d) {
 define void @st_param_v4_f32_iiri(float %c) {
 ; CHECK-LABEL: st_param_v4_f32_iiri(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_iiri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 80, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %f1, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %r1, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1950,13 +1950,13 @@ define void @st_param_v4_f32_iiri(float %c) {
 define void @st_param_v4_f32_irii(float %b) {
 ; CHECK-LABEL: st_param_v4_f32_irii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irii_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_irii_param_0];
 ; CHECK-NEXT:    { // callseq 81, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %r1, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1974,13 +1974,13 @@ define void @st_param_v4_f32_irii(float %b) {
 define void @st_param_v4_f32_riii(float %a) {
 ; CHECK-LABEL: st_param_v4_f32_riii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_riii_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_f32_riii_param_0];
 ; CHECK-NEXT:    { // callseq 82, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 0f40000000, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -2003,21 +2003,21 @@ declare void @call_v4_f32(%struct.float4 alignstack(16))
 
 define void @st_param_bfloat() {
 ; CHECK-LABEL: st_param_bfloat(
-; CHECK: {
-; CHECK-NEXT:	.reg .b16 	%rs<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:// %bb.0:
-; CHECK-NEXT:	mov.b16 	%rs1, 0x4100;
-; CHECK-NEXT:	{ // callseq 83, 0
-; CHECK-NEXT:	.param .align 2 .b8 param0[2];
-; CHECK-NEXT:	st.param.b16 	[param0], %rs1;
-; CHECK-NEXT:	call.uni
-; CHECK-NEXT:	call_bfloat,
-; CHECK-NEXT:	(
-; CHECK-NEXT:	param0
-; CHECK-NEXT:	);
-; CHECK-NEXT:	} // callseq 83
-; CHECK-NEXT:	ret;
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b16 %rs1, 0x4100;
+; CHECK-NEXT:    { // callseq 83, 0
+; CHECK-NEXT:    .param .align 2 .b8 param0[2];
+; CHECK-NEXT:    st.param.b16 [param0], %rs1;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_bfloat,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 83
+; CHECK-NEXT:    ret;
   %five = bitcast i16 16640 to bfloat
   call void @call_bfloat(bfloat %five)
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index ae74bbb866eb2..8056855a0d539 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -13,8 +13,7 @@ declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -23,8 +22,8 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_2];
 ; CHECK-NEXT:    suld.b.1d.b32.trap {%r2}, [%rd1, {%r1}];
-; CHECK-NEXT:    cvt.rn.f32.s32 %f1, %r2;
-; CHECK-NEXT:    st.global.b32 [%rd3], %f1;
+; CHECK-NEXT:    cvt.rn.f32.s32 %r3, %r2;
+; CHECK-NEXT:    st.global.b32 [%rd3], %r3;
 ; CHECK-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
   %ret = sitofp i32 %val to float
@@ -37,8 +36,7 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b32 %f<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -46,8 +44,8 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
 ; CHECK-NEXT:    ld.param.b32 %r1, [bar_param_1];
 ; CHECK-NEXT:    suld.b.1d.b32.trap {%r2}, [surf0, {%r1}];
-; CHECK-NEXT:    cvt.rn.f32.s32 %f1, %r2;
-; CHECK-NEXT:    st.global.b32 [%rd2], %f1;
+; CHECK-NEXT:    cvt.rn.f32.s32 %r3, %r2;
+; CHECK-NEXT:    st.global.b32 [%rd2], %r3;
 ; CHECK-NEXT:    ret;
   %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index 8dee5250920e6..edcb289d788f7 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -10,9 +10,9 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK: .param .surfref foo_param_0
 ; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
-; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; CHECK: cvt.rn.f32.s32 %r[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; CHECK: st.b32 [%rd{{[0-9]+}}], %f[[REDF]]
+; CHECK: st.b32 [%rd{{[0-9]+}}], %r[[REDF]]
   store float %ret, ptr %red
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 90d67666f1ed6..15b220ca2175f 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -48,7 +48,7 @@ def get_ptx_reg(ty):
         "b16": "%rs{{[0-9]+}}",
         "b32": "%r{{[0-9]+}}",
         "b64": "%rd{{[0-9]+}}",
-        "f32": "%f{{[0-9]+}}",
+        "f32": "%r{{[0-9]+}}",
         "u32": "%r{{[0-9]+}}",
         "s32": "%r{{[0-9]+}}",
     }
@@ -757,10 +757,10 @@ def get_llvm_tld4_access_type(geom):
 
 def get_ptx_tld4_access(geom):
     geom_to_access = {
-        "2d": "{%f{{[0-9]+}}, %f{{[0-9]+}}}",
-        "a2d": "{%r{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}",
-        "cube": "{%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}",
-        "acube": "{%r{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}",
+        "2d": "{%r{{[0-9]+}}, %r{{[0-9]+}}}",
+        "a2d": "{%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}",
+        "cube": "{%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}",
+        "acube": "{%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}",
     }
     return geom_to_access[geom]
 
diff --git a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
index 39cd054716b5f..73ac6e95e8c93 100644
--- a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
@@ -14,16 +14,15 @@ define ptx_kernel void @basic(ptr noalias readonly %a, ptr %out) {
 ;
 ; PTX-LABEL: basic(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %r<5>;
-; PTX-NEXT:    .reg .b32 %f<2>;
+; PTX-NEXT:    .reg .b32 %r<6>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b32 %r1, [basic_param_0];
 ; PTX-NEXT:    cvta.to.global.u32 %r2, %r1;
 ; PTX-NEXT:    ld.param.b32 %r3, [basic_param_1];
 ; PTX-NEXT:    cvta.to.global.u32 %r4, %r3;
-; PTX-NEXT:    ld.global.nc.b32 %f1, [%r2];
-; PTX-NEXT:    st.global.b32 [%r4], %f1;
+; PTX-NEXT:    ld.global.nc.b32 %r5, [%r2];
+; PTX-NEXT:    st.global.b32 [%r4], %r5;
 ; PTX-NEXT:    ret;
   %a_global = addrspacecast ptr %a to ptr addrspace(1)
   %val = load float, ptr addrspace(1) %a_global
@@ -77,15 +76,14 @@ define void @not_kernel(ptr noalias readonly %a, ptr %out) {
 ;
 ; PTX-LABEL: not_kernel(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %r<4>;
-; PTX-NEXT:    .reg .b32 %f<2>;
+; PTX-NEXT:    .reg .b32 %r<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    ld.param.b32 %r1, [not_kernel_param_0];
 ; PTX-NEXT:    cvta.to.global.u32 %r2, %r1;
 ; PTX-NEXT:    ld.param.b32 %r3, [not_kernel_param_1];
-; PTX-NEXT:    ld.global.b32 %f1, [%r2];
-; PTX-NEXT:    st.b32 [%r3], %f1;
+; PTX-NEXT:    ld.global.b32 %r4, [%r2];
+; PTX-NEXT:    st.b32 [%r3], %r4;
 ; PTX-NEXT:    ret;
   %a_global = addrspacecast ptr %a to ptr addrspace(1)
   %val = load float, ptr addrspace(1) %a_global
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index 3d6489a2340da..a97a8b5822f99 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -13,8 +13,7 @@ declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -22,8 +21,8 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-NEXT:    ld.param.b64 %rd2, [foo_param_1];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_2];
-; CHECK-NEXT:    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [%rd1, {%r1}];
-; CHECK-NEXT:    st.global.b32 [%rd3], %f1;
+; CHECK-NEXT:    tex.1d.v4.f32.s32 {%r2, %r3, %r4, %r5}, [%rd1, {%r1}];
+; CHECK-NEXT:    st.global.b32 [%rd3], %r2;
 ; CHECK-NEXT:    ret;
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
@@ -37,16 +36,15 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<5>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [bar_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
 ; CHECK-NEXT:    ld.param.b32 %r1, [bar_param_1];
-; CHECK-NEXT:    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [tex0, {%r1}];
-; CHECK-NEXT:    st.global.b32 [%rd2], %f1;
+; CHECK-NEXT:    tex.1d.v4.f32.s32 {%r2, %r3, %r4, %r5}, [tex0, {%r1}];
+; CHECK-NEXT:    st.global.b32 [%rd2], %r2;
 ; CHECK-NEXT:    ret;
   %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
@@ -60,8 +58,7 @@ declare float @texfunc(i64)
 define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-LABEL: baz(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-NEXT:    .reg .b32 %f<8>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -69,7 +66,7 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
 ; CHECK-NEXT:    ld.param.b32 %r1, [baz_param_1];
 ; CHECK-NEXT:    mov.u64 %rd3, tex0;
-; CHECK-NEXT:    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [tex0, {%r1}];
+; CHECK-NEXT:    tex.1d.v4.f32.s32 {%r2, %r3, %r4, %r5}, [tex0, {%r1}];
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .b64 param0;
 ; CHECK-NEXT:    st.param.b64 [param0], %rd3;
@@ -79,10 +76,10 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.b32 %f5, [retval0];
+; CHECK-NEXT:    ld.param.b32 %r6, [retval0];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    add.rn.f32 %f7, %f1, %f5;
-; CHECK-NEXT:    st.global.b32 [%rd2], %f7;
+; CHECK-NEXT:    add.rn.f32 %r8, %r2, %r6;
+; CHECK-NEXT:    st.global.b32 [%rd2], %r8;
 ; CHECK-NEXT:    ret;
   %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index 22116b2fafc39..01b816a665d65 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -7,10 +7,10 @@ declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32
 
 ; CHECK: .entry foo
 define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
-; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
+; CHECK: tex.1d.v4.f32.s32 {%r[[RED:[0-9]+]], %r[[GREEN:[0-9]+]], %r[[BLUE:[0-9]+]], %r[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; CHECK: st.b32 [%rd{{[0-9]+}}], %f[[RED]]
+; CHECK: st.b32 [%rd{{[0-9]+}}], %r[[RED]]
   store float %ret, ptr %red
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 82ebb0ca57377..efbac868dba38 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -265,7 +265,7 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
 ; CHECK-LABEL: test_s_i8f32p(
 ; CHECK:        .param .align 8 .b8 test_s_i8f32p_param_0[24]
-; CHECK-DAG:    ld.param.b32 [[P0:%f[0-9]+]],    [test_s_i8f32p_param_0];
+; CHECK-DAG:    ld.param.b32 [[P0:%r[0-9]+]],    [test_s_i8f32p_param_0];
 ; CHECK-DAG:    ld.param.b8  [[P2_0:%r[0-9]+]],   [test_s_i8f32p_param_0+5];
 ; CHECK-DAG:    ld.param.b8  [[P2_1:%r[0-9]+]],   [test_s_i8f32p_param_0+6];
 ; CHECK-DAG:    ld.param.b8  [[P2_2:%r[0-9]+]],   [test_s_i8f32p_param_0+7];
@@ -291,7 +291,7 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-NEXT:   (
 ; CHECK-NEXT:   param0
 ; CHECK-NEXT:   );
-; CHECK-DAG:    ld.param.b32 [[R0:%f[0-9]+]],    [retval0];
+; CHECK-DAG:    ld.param.b32 [[R0:%r[0-9]+]],    [retval0];
 ; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
 ; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
 ; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
@@ -312,7 +312,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
 ; CHECK-LABEL: test_s_i8f64p(
 ; CHECK:        .param .align 8 .b8 test_s_i8f64p_param_0[32]
-; CHECK-DAG:    ld.param.b64 [[P0:%fd[0-9]+]],    [test_s_i8f64p_param_0];
+; CHECK-DAG:    ld.param.b64 [[P0:%rd[0-9]+]],    [test_s_i8f64p_param_0];
 ; CHECK-DAG:    ld.param.b8  [[P2_0:%rd[0-9]+]],   [test_s_i8f64p_param_0+9];
 ; CHECK-DAG:    ld.param.b8  [[P2_1:%rd[0-9]+]],   [test_s_i8f64p_param_0+10];
 ; CHECK-DAG:    ld.param.b8  [[P2_2:%rd[0-9]+]],   [test_s_i8f64p_param_0+11];
@@ -358,7 +358,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-NEXT:   (
 ; CHECK-NEXT:   param0
 ; CHECK-NEXT:   );
-; CHECK-DAG:    ld.param.b64 [[R0:%fd[0-9]+]],   [retval0];
+; CHECK-DAG:    ld.param.b64 [[R0:%rd[0-9]+]],   [retval0];
 ; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+9];
 ; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+10];
 ; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+11];
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index 0cd0d29294c32..3ca729f07af8a 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -53,7 +53,7 @@ entry:
 ; CHECK-NEXT:    and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8;
 ; CHECK-NEXT:    st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
-; CHECK-NEXT:    ld.local.b64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
+; CHECK-NEXT:    ld.local.b64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 
   %2 = va_arg ptr %al, double
 
@@ -84,7 +84,7 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) {
 ; Load arguments to temporary variables
 ; CHECK32:       ld.param.b32 [[ARG_VOID_PTR:%r[0-9]+]], [test_foo_param_3];
 ; CHECK64:       ld.param.b64 [[ARG_VOID_PTR:%rd[0-9]+]], [test_foo_param_3];
-; CHECK-NEXT:    ld.param.b64 [[ARG_DOUBLE:%fd[0-9]+]], [test_foo_param_2];
+; CHECK-NEXT:    ld.param.b64 [[ARG_DOUBLE:%rd[0-9]+]], [test_foo_param_2];
 ; CHECK-NEXT:    ld.param.b64 [[ARG_I64:%rd[0-9]+]], [test_foo_param_1];
 ; CHECK-NEXT:    ld.param.b32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0];
 
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 3235587f3d563..3bbdf641ade26 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -12,8 +12,7 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-LABEL: variadics1(
 ; CHECK-PTX:       {
 ; CHECK-PTX-NEXT:    .reg .b32 %r<11>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
-; CHECK-PTX-NEXT:    .reg .b64 %fd<7>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<17>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    ld.param.b32 %r1, [variadics1_param_0];
@@ -32,16 +31,16 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    cvt.u32.u64 %r8, %rd6;
 ; CHECK-PTX-NEXT:    add.s64 %rd7, %rd3, 15;
 ; CHECK-PTX-NEXT:    and.b64 %rd8, %rd7, -8;
-; CHECK-PTX-NEXT:    ld.b64 %fd1, [%rd8];
-; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r8;
-; CHECK-PTX-NEXT:    add.rn.f64 %fd3, %fd2, %fd1;
-; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %fd3;
-; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, 15;
-; CHECK-PTX-NEXT:    and.b64 %rd10, %rd9, -8;
-; CHECK-PTX-NEXT:    ld.b64 %fd4, [%rd10];
-; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r9;
-; CHECK-PTX-NEXT:    add.rn.f64 %fd6, %fd5, %fd4;
-; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd6;
+; CHECK-PTX-NEXT:    ld.b64 %rd9, [%rd8];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %rd10, %r8;
+; CHECK-PTX-NEXT:    add.rn.f64 %rd11, %rd10, %rd9;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %rd11;
+; CHECK-PTX-NEXT:    add.s64 %rd12, %rd8, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd13, %rd12, -8;
+; CHECK-PTX-NEXT:    ld.b64 %rd14, [%rd13];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %rd15, %r9;
+; CHECK-PTX-NEXT:    add.rn.f64 %rd16, %rd15, %rd14;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %rd16;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r10;
 ; CHECK-PTX-NEXT:    ret;
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index 8710d58ce6e99..765e50554c8d2 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -5,10 +5,10 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 define <16 x float> @test_v16f32(<16 x float> %a) {
 ; CHECK-LABEL: test_v16f32(
-; CHECK-DAG: ld.param.v4.b32     {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
-; CHECK-DAG: ld.param.v4.b32     {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
-; CHECK-DAG: ld.param.v4.b32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
-; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
+; CHECK-DAG: ld.param.v4.b32     {[[V_12_15:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
+; CHECK-DAG: ld.param.v4.b32     {[[V_8_11:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
+; CHECK-DAG: ld.param.v4.b32     {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
 ; CHECK-DAG: st.param.v4.b32     [func_retval0],  {[[V_0_3]]}
 ; CHECK-DAG: st.param.v4.b32     [func_retval0+16], {[[V_4_7]]}
 ; CHECK-DAG: st.param.v4.b32     [func_retval0+32], {[[V_8_11]]}
@@ -19,8 +19,8 @@ define <16 x float> @test_v16f32(<16 x float> %a) {
 
 define <8 x float> @test_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: test_v8f32(
-; CHECK-DAG: ld.param.v4.b32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
-; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
+; CHECK-DAG: ld.param.v4.b32     {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
 ; CHECK-DAG: st.param.v4.b32     [func_retval0],  {[[V_0_3]]}
 ; CHECK-DAG: st.param.v4.b32     [func_retval0+16], {[[V_4_7]]}
 ; CHECK: ret;
@@ -29,7 +29,7 @@ define <8 x float> @test_v8f32(<8 x float> %a) {
 
 define <4 x float> @test_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: test_v4f32(
-; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
 ; CHECK-DAG: st.param.v4.b32     [func_retval0],  {[[V_0_3]]}
 ; CHECK: ret;
   ret <4 x float> %a
@@ -37,7 +37,7 @@ define <4 x float> @test_v4f32(<4 x float> %a) {
 
 define <2 x float> @test_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: test_v2f32(
-; CHECK-DAG: ld.param.v2.b32     {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
+; CHECK-DAG: ld.param.v2.b32     {[[V_0_3:(%r[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
 ; CHECK-DAG: st.param.v2.b32     [func_retval0],  {[[V_0_3]]}
 ; CHECK: ret;
   ret <2 x float> %a
@@ -46,8 +46,8 @@ define <2 x float> @test_v2f32(<2 x float> %a) {
 ; Oddly shaped vectors should not load any extra elements.
 define <3 x float> @test_v3f32(<3 x float> %a) {
 ; CHECK-LABEL: test_v3f32(
-; CHECK-DAG: ld.param.b32        [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
-; CHECK-DAG: ld.param.v2.b32     {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
+; CHECK-DAG: ld.param.b32        [[V_2:%r[0-9]+]], [test_v3f32_param_0+8];
+; CHECK-DAG: ld.param.v2.b32     {[[V_0_1:(%r[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
 ; CHECK-DAG: st.param.v2.b32     [func_retval0], {[[V_0_1]]}
 ; CHECK-DAG: st.param.b32        [func_retval0+8], [[V_2]]
 ; CHECK: ret;
diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll
index 192cd562d67b9..b08c19206a0b8 100644
--- a/llvm/test/CodeGen/NVPTX/vector-args.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-args.ll
@@ -4,7 +4,7 @@
 define float @foo(<2 x float> %a) {
 ; CHECK: .func (.param .b32 func_retval0) foo
 ; CHECK: .param .align 8 .b8 foo_param_0[8]
-; CHECK: ld.param.v2.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.param.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = fmul <2 x float> %a, %a
   %t2 = extractelement <2 x float> %t1, i32 0
   %t3 = extractelement <2 x float> %t1, i32 1
@@ -16,7 +16,7 @@ define float @foo(<2 x float> %a) {
 define float @bar(<4 x float> %a) {
 ; CHECK: .func (.param .b32 func_retval0) bar
 ; CHECK: .param .align 16 .b8 bar_param_0[16]
-; CHECK: ld.param.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = fmul <4 x float> %a, %a
   %t2 = extractelement <4 x float> %t1, i32 0
   %t3 = extractelement <4 x float> %t1, i32 1
@@ -28,8 +28,8 @@ define float @bar(<4 x float> %a) {
 define <4 x float> @baz(<4 x float> %a) {
 ; CHECK: .func  (.param .align 16 .b8 func_retval0[16]) baz
 ; CHECK: .param .align 16 .b8 baz_param_0[16]
-; CHECK: ld.param.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-; CHECK: st.param.v4.b32 [func_retval0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK: st.param.v4.b32 [func_retval0], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = fmul <4 x float> %a, %a
   ret <4 x float> %t1
 }
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 825a66ec04b5e..88ff59407a143 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -4,13 +4,13 @@
 ; Even though general vector types are not supported in PTX, we can still
 ; optimize loads/stores with pseudo-vector instructions of the form:
 ;
-; ld.v2.f32 {%f0, %f1}, [%r0]
+; ld.v2.f32 {%r0, %r1}, [%r0]
 ;
 ; which will load two floats at once into scalar registers.
 
 ; CHECK-LABEL: foo
 define void @foo(ptr %a) {
-; CHECK: ld.v2.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <2 x float>, ptr %a
   %t2 = fmul <2 x float> %t1, %t1
   store <2 x float> %t2, ptr %a
@@ -19,7 +19,7 @@ define void @foo(ptr %a) {
 
 ; CHECK-LABEL: foo2
 define void @foo2(ptr %a) {
-; CHECK: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <4 x float>, ptr %a
   %t2 = fmul <4 x float> %t1, %t1
   store <4 x float> %t2, ptr %a
@@ -28,8 +28,8 @@ define void @foo2(ptr %a) {
 
 ; CHECK-LABEL: foo3
 define void @foo3(ptr %a) {
-; CHECK: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-; CHECK-NEXT: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK-NEXT: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <8 x float>, ptr %a
   %t2 = fmul <8 x float> %t1, %t1
   store <8 x float> %t2, ptr %a
@@ -105,14 +105,14 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst
 ; CHECK: mov.b32 {%rs
 ; CHECK: mov.b32 {%rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
 ; CHECK: st.global.v4.b32
 ; CHECK: st.global.v4.b32
@@ -128,17 +128,17 @@ define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst,
 ; CHECK: ld.global.b32 %r
   %v = load <8 x half>, ptr addrspace(1) %src, align 4
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
 ; CHECK: st.global.v4.b32
 ; CHECK: st.global.v4.b32
@@ -155,14 +155,14 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia
 ; CHECK: mov.b32 {%rs
 ; CHECK: mov.b32 {%rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
 ; CHECK: st.v4.b32
 ; CHECK: st.v4.b32
@@ -178,17 +178,17 @@ define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias
 ; CHECK: ld.b32 %r
   %v = load <8 x half>, ptr %src, align 4
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
 ; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
+; CHECK: cvt.f32.f16 %r{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
 ; CHECK: st.v4.b32
 ; CHECK: st.v4.b32
diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index ce275c9b71282..2ee489670e9e4 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -33,8 +33,8 @@ def __init__(self, ptx_type):
 
         self.ptx_reg_pattern = {
             "f16": "%r[0-9]+",
-            "f32": "%f[0-9]+",
-            "f64": "%fd[0-9]+",
+            "f32": "%r[0-9]+",
+            "f64": "%rd[0-9]+",
         }.get(ptx_type, "%r[0-9]+")
 
     def __repr__(self):

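A quick note on what the test churn above amounts to in the emitted PTX: f32
values now live in the untyped .b32 "%r" registers and f64 values in the
.b64 "%rd" registers, while the floating-point opcodes themselves are
unchanged. The sketch below distills that convention from the test updates;
the function names and CHECK lines are illustrative only, not taken from the
patch:

; Hypothetical f32 case: the operands and result of a float add are now
; plain %r registers, but the instruction is still add.rn.f32.
define float @fadd_f32_demo(float %a, float %b) {
; CHECK: ld.param.b32 %r{{[0-9]+}}, [fadd_f32_demo_param_0];
; CHECK: add.rn.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}};
  %sum = fadd float %a, %b
  ret float %sum
}

; Hypothetical f64 case: doubles follow the same pattern in %rd registers.
define double @fadd_f64_demo(double %a, double %b) {
; CHECK: ld.param.b64 %rd{{[0-9]+}}, [fadd_f64_demo_param_0];
; CHECK: add.rn.f64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}};
  %sum = fadd double %a, %b
  ret double %sum
}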