[llvm] dc90f42 - Coalesce 16-bit FP types to use integer register classes.

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 5 12:22:12 PDT 2023


Author: Artem Belevich
Date: 2023-06-05T12:21:52-07:00
New Revision: dc90f42ea7b4f6d9e643f5ad2ba663eba2f9e421

URL: https://github.com/llvm/llvm-project/commit/dc90f42ea7b4f6d9e643f5ad2ba663eba2f9e421
DIFF: https://github.com/llvm/llvm-project/commit/dc90f42ea7b4f6d9e643f5ad2ba663eba2f9e421.diff

LOG: Coalesce 16-bit FP types to use integer register classes.

i16, f16, and bf16 will share the same .b16 registers, and
i32, v2f16, and v2bf16 will share the same .b32 registers.

The changes are mostly mechanical, intended to remove unnecessary register
classes which tend to produce redundant register moves.

Differential Revision: https://reviews.llvm.org/D151601

v2f16 regtype conversion to i32

Added: 
    

Modified: 
    clang/test/CodeGenCUDA/bf16.cu
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
    llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
    llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
    llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
    llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
    llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
    llvm/test/CodeGen/NVPTX/bf16.ll
    llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
    llvm/test/CodeGen/NVPTX/f16-instructions.ll
    llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
    llvm/test/CodeGen/NVPTX/half.ll
    llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
    llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
    llvm/test/CodeGen/NVPTX/ldg-invariant.ll
    llvm/test/CodeGen/NVPTX/ldu-ldg.ll
    llvm/test/CodeGen/NVPTX/param-load-store.ll
    llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
    llvm/test/CodeGen/NVPTX/wmma.py

Removed: 
    


################################################################################
diff  --git a/clang/test/CodeGenCUDA/bf16.cu b/clang/test/CodeGenCUDA/bf16.cu
index 0b4f375b95681..32082904c4d81 100644
--- a/clang/test/CodeGenCUDA/bf16.cu
+++ b/clang/test/CodeGenCUDA/bf16.cu
@@ -2,7 +2,7 @@
 // REQUIRES: x86-registered-target
 
 // RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "nvptx64-nvidia-cuda" \
-// RUN:    -fcuda-is-device "-aux-target-cpu" "x86-64" -S -o - %s | FileCheck %s
+// RUN:    -fcuda-is-device "-aux-target-cpu" "x86-64" -O1 -S -o - %s | FileCheck %s
 
 #include "Inputs/cuda.h"
 
@@ -11,10 +11,11 @@
 // CHECK:        .param .b16 _Z8test_argPDF16bDF16b_param_1
 //
 __device__ void test_arg(__bf16 *out, __bf16 in) {
-// CHECK:         ld.param.b16    %{{h.*}}, [_Z8test_argPDF16bDF16b_param_1];
+// CHECK-DAG:     ld.param.u64  %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0];
+// CHECK-DAG:     ld.param.b16  %[[R:rs[0-9]+]], [_Z8test_argPDF16bDF16b_param_1];
   __bf16 bf16 = in;
   *out = bf16;
-// CHECK:         st.b16
+// CHECK:         st.b16         [%[[A]]], %[[R]]
 // CHECK:         ret;
 }
 
@@ -22,25 +23,27 @@ __device__ void test_arg(__bf16 *out, __bf16 in) {
 // CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z8test_retDF16b(
 // CHECK:         .param .b16 _Z8test_retDF16b_param_0
 __device__ __bf16 test_ret( __bf16 in) {
-// CHECK:        ld.param.b16    %h{{.*}}, [_Z8test_retDF16b_param_0];
+// CHECK:        ld.param.b16    %[[R:rs[0-9]+]], [_Z8test_retDF16b_param_0];
   return in;
-// CHECK:        st.param.b16    [func_retval0+0], %h
+// CHECK:        st.param.b16    [func_retval0+0], %[[R]]
 // CHECK:        ret;
 }
 
+__device__ __bf16 external_func( __bf16 in);
+
 // CHECK-LABEL: .visible .func  (.param .b32 func_retval0) _Z9test_callDF16b(
 // CHECK:        .param .b16 _Z9test_callDF16b_param_0
 __device__ __bf16 test_call( __bf16 in) {
-// CHECK:        ld.param.b16    %h{{.*}}, [_Z9test_callDF16b_param_0];
-// CHECK:        st.param.b16    [param0+0], %h2;
+// CHECK:        ld.param.b16    %[[R:rs[0-9]+]], [_Z9test_callDF16b_param_0];
+// CHECK:        st.param.b16    [param0+0], %[[R]];
 // CHECK:        .param .b32 retval0;
 // CHECK:        call.uni (retval0),
-// CHECK-NEXT:   _Z8test_retDF16b,
+// CHECK-NEXT:   _Z13external_funcDF16b,
 // CHECK-NEXT:   (
 // CHECK-NEXT:   param0
 // CHECK-NEXT    );
-// CHECK:        ld.param.b16    %h{{.*}}, [retval0+0];
-  return test_ret(in);
-// CHECK:        st.param.b16    [func_retval0+0], %h
+// CHECK:        ld.param.b16    %[[RET:rs[0-9]+]], [retval0+0];
+  return external_func(in);
+// CHECK:        st.param.b16    [func_retval0+0], %[[RET]]
 // CHECK:        ret;
 }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 5fce6d67f3067..179306b59b0ff 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -309,10 +309,6 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
       Ret = (5 << 28);
     } else if (RC == &NVPTX::Float64RegsRegClass) {
       Ret = (6 << 28);
-    } else if (RC == &NVPTX::Float16RegsRegClass) {
-      Ret = (7 << 28);
-    } else if (RC == &NVPTX::Float16x2RegsRegClass) {
-      Ret = (8 << 28);
     } else {
       report_fatal_error("Bad register class");
     }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 10b633d1b9422..686a8d9f5448f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -816,8 +816,7 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
 static std::optional<unsigned>
 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                 unsigned Opcode_i16, unsigned Opcode_i32,
-                std::optional<unsigned> Opcode_i64, unsigned Opcode_f16,
-                unsigned Opcode_f16x2, unsigned Opcode_f32,
+                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                 std::optional<unsigned> Opcode_f64) {
   switch (VT) {
   case MVT::i1:
@@ -831,10 +830,10 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
     return Opcode_i64;
   case MVT::f16:
   case MVT::bf16:
-    return Opcode_f16;
+    return Opcode_i16;
   case MVT::v2f16:
   case MVT::v2bf16:
-    return Opcode_f16x2;
+    return Opcode_i32;
   case MVT::f32:
     return Opcode_f32;
   case MVT::f64:
@@ -935,10 +934,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N1, Addr)) {
-    Opcode = pickOpcodeForVT(
-        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
-        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
-        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
+    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
+                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
+                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
     if (!Opcode)
       return false;
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
@@ -948,9 +946,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
     Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
-                                 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
-                                 NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
-                                 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
+                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
+                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
     if (!Opcode)
       return false;
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
@@ -960,15 +957,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
     if (PointerSize == 64)
-      Opcode = pickOpcodeForVT(
-          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
-          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
-          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
+      Opcode =
+          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
+                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
+                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
     else
-      Opcode = pickOpcodeForVT(
-          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
-          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
-          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
+      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
+                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
+                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
     if (!Opcode)
       return false;
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
@@ -977,16 +973,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else {
     if (PointerSize == 64)
-      Opcode = pickOpcodeForVT(
-          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
-          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
-          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
-          NVPTX::LD_f64_areg_64);
+      Opcode =
+          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
+                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
+                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
     else
-      Opcode = pickOpcodeForVT(
-          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
-          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
-          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
+      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
+                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
+                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
     if (!Opcode)
       return false;
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
@@ -1090,15 +1084,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                                NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
-                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                                NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
       break;
     case NVPTXISD::LoadV4:
-      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                               NVPTX::LDV_i8_v4_avar, NVPTX::LDV_i16_v4_avar,
-                               NVPTX::LDV_i32_v4_avar, std::nullopt,
-                               NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
-                               NVPTX::LDV_f32_v4_avar, std::nullopt);
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
+                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
+                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
       break;
     }
     if (!Opcode)
@@ -1117,15 +1109,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                                NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
-                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                                NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
       break;
     case NVPTXISD::LoadV4:
-      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                               NVPTX::LDV_i8_v4_asi, NVPTX::LDV_i16_v4_asi,
-                               NVPTX::LDV_i32_v4_asi, std::nullopt,
-                               NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
-                               NVPTX::LDV_f32_v4_asi, std::nullopt);
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
+                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
+                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
       break;
     }
     if (!Opcode)
@@ -1142,18 +1132,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::LoadV2:
-        Opcode = pickOpcodeForVT(
-            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
-            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
-            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
-            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
-            NVPTX::LDV_f64_v2_ari_64);
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
+                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
+                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
         break;
       case NVPTXISD::LoadV4:
         Opcode = pickOpcodeForVT(
             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
             NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
-            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
             NVPTX::LDV_f32_v4_ari_64, std::nullopt);
         break;
       }
@@ -1165,15 +1153,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                  NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                  NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
-                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                  NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
         break;
       case NVPTXISD::LoadV4:
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::LDV_i8_v4_ari, NVPTX::LDV_i16_v4_ari,
-                                 NVPTX::LDV_i32_v4_ari, std::nullopt,
-                                 NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
-                                 NVPTX::LDV_f32_v4_ari, std::nullopt);
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
+                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
+                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
         break;
       }
     }
@@ -1193,15 +1179,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
         Opcode = pickOpcodeForVT(
             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
             NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
-            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
-            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
+            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
             NVPTX::LDV_f64_v2_areg_64);
         break;
       case NVPTXISD::LoadV4:
         Opcode = pickOpcodeForVT(
             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
             NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
-            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
             NVPTX::LDV_f32_v4_areg_64, std::nullopt);
         break;
       }
@@ -1213,16 +1197,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
         Opcode =
             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                             NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
-                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
-                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
+                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                             NVPTX::LDV_f64_v2_areg);
         break;
       case NVPTXISD::LoadV4:
-        Opcode = pickOpcodeForVT(
-            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
-            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, std::nullopt,
-            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
-            NVPTX::LDV_f32_v4_areg, std::nullopt);
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
+                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
+                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
         break;
       }
     }
@@ -1310,47 +1292,39 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     case ISD::INTRINSIC_W_CHAIN:
       if (IsLDG)
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
-                                     NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
+                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
+                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
+                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
+                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
+                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
       else
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
-                                     NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
+                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
+                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
+                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
+                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
+                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
+                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
       break;
     case NVPTXISD::LoadV2:
     case NVPTXISD::LDGV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                   NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
-                                   NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
+                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
       break;
     case NVPTXISD::LDUV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                   NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
-                                   NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
+                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
       break;
     case NVPTXISD::LoadV4:
     case NVPTXISD::LDGV4:
@@ -1358,8 +1332,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
           NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
           NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
-          NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
-          NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
           NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
       break;
     case NVPTXISD::LDUV4:
@@ -1367,8 +1339,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
           NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
           NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
-          NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
-          NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
           NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
       break;
     }
@@ -1390,8 +1360,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
         else
@@ -1400,8 +1368,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
         break;
@@ -1412,8 +1378,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
-                                     NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
-                                     NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
         break;
@@ -1423,8 +1387,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
-                                     NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
-                                     NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
         break;
@@ -1434,8 +1396,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
-            NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
-            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
         break;
       case NVPTXISD::LDUV4:
@@ -1443,8 +1403,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
-            NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
-            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
         break;
       }
@@ -1456,47 +1414,39 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
       case ISD::INTRINSIC_W_CHAIN:
         if (IsLDG)
           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
         else
           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
         break;
       case NVPTXISD::LoadV2:
       case NVPTXISD::LDGV2:
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                     NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
-                                     NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
+                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
         break;
       case NVPTXISD::LDUV2:
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                     NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
-                                     NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
+                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
         break;
       case NVPTXISD::LoadV4:
       case NVPTXISD::LDGV4:
@@ -1504,8 +1454,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
-            NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
-            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
         break;
       case NVPTXISD::LDUV4:
@@ -1513,8 +1461,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
-            NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
-            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
         break;
       }
@@ -1536,8 +1482,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
-                                       NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
         else
@@ -1546,8 +1490,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
-                                       NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
         break;
@@ -1558,8 +1500,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
-                                     NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
-                                     NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
         break;
@@ -1569,8 +1509,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
-                                     NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
-                                     NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
         break;
@@ -1580,8 +1518,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
-            NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
-            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
         break;
       case NVPTXISD::LDUV4:
@@ -1589,8 +1525,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
-            NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
-            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
         break;
       }
@@ -1606,8 +1540,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
-                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
-                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
         else
@@ -1616,8 +1548,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
-                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
-                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
         break;
@@ -1628,8 +1558,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
-                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
-                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
         break;
@@ -1639,8 +1567,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
-                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
-                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
         break;
@@ -1650,8 +1576,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
-            NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
-            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
         break;
       case NVPTXISD::LDUV4:
@@ -1659,8 +1583,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
-            NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
-            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
         break;
       }
@@ -1781,7 +1703,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   if (SelectDirectAddr(BasePtr, Addr)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
-                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
     if (!Opcode)
       return false;
@@ -1799,7 +1720,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
                  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
-                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
     if (!Opcode)
       return false;
@@ -1817,14 +1737,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
                  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
     if (PointerSize == 64)
-      Opcode = pickOpcodeForVT(
-          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
-          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
-          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
+      Opcode =
+          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
+                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
+                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
     else
       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                                NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
-                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                                NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
     if (!Opcode)
       return false;
@@ -1844,12 +1763,10 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
       Opcode =
           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                           NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
-                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                           NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
     else
       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                                NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
-                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                                NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
     if (!Opcode)
       return false;
@@ -1955,14 +1872,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                                NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
-                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
                                NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
       break;
     case NVPTXISD::StoreV4:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                                NVPTX::STV_i32_v4_avar, std::nullopt,
-                               NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
                                NVPTX::STV_f32_v4_avar, std::nullopt);
       break;
     }
@@ -1976,15 +1891,13 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                                NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
-                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
                                NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
       break;
     case NVPTXISD::StoreV4:
-      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                               NVPTX::STV_i8_v4_asi, NVPTX::STV_i16_v4_asi,
-                               NVPTX::STV_i32_v4_asi, std::nullopt,
-                               NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
-                               NVPTX::STV_f32_v4_asi, std::nullopt);
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
+                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
+                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
       break;
     }
     StOps.push_back(Base);
@@ -1996,18 +1909,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::StoreV2:
-        Opcode = pickOpcodeForVT(
-            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
-            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
-            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
-            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
-            NVPTX::STV_f64_v2_ari_64);
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
+                            NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
+                            NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
         break;
       case NVPTXISD::StoreV4:
         Opcode = pickOpcodeForVT(
             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
             NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
-            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
             NVPTX::STV_f32_v4_ari_64, std::nullopt);
         break;
       }
@@ -2019,14 +1930,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
-                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
                                  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
         break;
       case NVPTXISD::StoreV4:
         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                  NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                  NVPTX::STV_i32_v4_ari, std::nullopt,
-                                 NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
                                  NVPTX::STV_f32_v4_ari, std::nullopt);
         break;
       }
@@ -2042,15 +1951,13 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
         Opcode = pickOpcodeForVT(
             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
             NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
-            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
-            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
+            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
             NVPTX::STV_f64_v2_areg_64);
         break;
       case NVPTXISD::StoreV4:
         Opcode = pickOpcodeForVT(
             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
             NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
-            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
             NVPTX::STV_f32_v4_areg_64, std::nullopt);
         break;
       }
@@ -2062,16 +1969,14 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
         Opcode =
             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                             NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
-                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
-                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
+                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
                             NVPTX::STV_f64_v2_areg);
         break;
       case NVPTXISD::StoreV4:
-        Opcode = pickOpcodeForVT(
-            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
-            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, std::nullopt,
-            NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
-            NVPTX::STV_f32_v4_areg, std::nullopt);
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
+                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
+                            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
         break;
       }
     }
@@ -2126,23 +2031,20 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
-                             NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
     break;
   case 2:
     Opcode =
         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
-                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
-                        NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
+                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                         NVPTX::LoadParamMemV2F64);
     break;
   case 4:
-    Opcode = pickOpcodeForVT(
-        MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
-        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, std::nullopt,
-        NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
-        NVPTX::LoadParamMemV4F32, std::nullopt);
+    Opcode =
+        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
+                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
+                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
     break;
   }
   if (!Opcode)
@@ -2210,21 +2112,18 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
-                             NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
     break;
   case 2:
     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
-                             NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
     break;
   case 4:
     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                              NVPTX::StoreRetvalV4I32, std::nullopt,
-                             NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
                              NVPTX::StoreRetvalV4F32, std::nullopt);
     break;
   }
@@ -2289,21 +2188,18 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
-                               NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
       break;
     case 2:
       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
-                               NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
       break;
     case 4:
       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
                                NVPTX::StoreParamV4I32, std::nullopt,
-                               NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
                                NVPTX::StoreParamV4F32, std::nullopt);
       break;
     }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 692ec58bdedff..c46ed2111258c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -410,10 +410,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
-  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
-  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
-  addRegisterClass(MVT::bf16, &NVPTX::Float16RegsRegClass);
-  addRegisterClass(MVT::v2bf16, &NVPTX::Float16x2RegsRegClass);
+  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
 
   // Conversion to/from FP16/FP16x2 is always legal.
   setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);

diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 8df6f13aa68e1..b0d792b5ee3fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -51,11 +51,6 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   } else if (DestRC == &NVPTX::Int64RegsRegClass) {
     Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
                                              : NVPTX::BITCONVERT_64_F2I);
-  } else if (DestRC == &NVPTX::Float16RegsRegClass) {
-    Op = (SrcRC == &NVPTX::Float16RegsRegClass ? NVPTX::FMOV16rr
-                                               : NVPTX::BITCONVERT_16_I2F);
-  } else if (DestRC == &NVPTX::Float16x2RegsRegClass) {
-    Op = NVPTX::IMOV32rr;
   } else if (DestRC == &NVPTX::Float32RegsRegClass) {
     Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
                                                : NVPTX::BITCONVERT_32_I2F);

diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index a540b3d8364f1..43fd6da00828e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -164,10 +164,10 @@ class ValueToRegClass<ValueType T> {
      !eq(name, "i16"): Int16Regs,
      !eq(name, "i32"): Int32Regs,
      !eq(name, "i64"): Int64Regs,
-     !eq(name, "f16"): Float16Regs,
-     !eq(name, "v2f16"): Float16x2Regs,
-     !eq(name, "bf16"): Float16Regs,
-     !eq(name, "v2bf16"): Float16x2Regs,
+     !eq(name, "f16"): Int16Regs,
+     !eq(name, "v2f16"): Int32Regs,
+     !eq(name, "bf16"): Int16Regs,
+     !eq(name, "v2bf16"): Int32Regs,
      !eq(name, "f32"): Float32Regs,
      !eq(name, "f64"): Float64Regs,
      !eq(name, "ai32"): Int32ArgRegs,
@@ -280,29 +280,29 @@ multiclass F3<string OpcStr, SDNode OpNode> {
                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
 
    def f16rr_ftz :
-     NVPTXInst<(outs Float16Regs:$dst),
-               (ins Float16Regs:$a, Float16Regs:$b),
+     NVPTXInst<(outs Int16Regs:$dst),
+               (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
-               [(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
+               [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
                Requires<[useFP16Math, doF32FTZ]>;
    def f16rr :
-     NVPTXInst<(outs Float16Regs:$dst),
-               (ins Float16Regs:$a, Float16Regs:$b),
+     NVPTXInst<(outs Int16Regs:$dst),
+               (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
-               [(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
+               [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
                Requires<[useFP16Math]>;
 
    def f16x2rr_ftz :
-     NVPTXInst<(outs Float16x2Regs:$dst),
-               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+     NVPTXInst<(outs Int32Regs:$dst),
+               (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
-               [(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
+               [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math, doF32FTZ]>;
    def f16x2rr :
-     NVPTXInst<(outs Float16x2Regs:$dst),
-               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+     NVPTXInst<(outs Int32Regs:$dst),
+               (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
-               [(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
+               [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math]>;
 }
 
@@ -354,29 +354,29 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
                Requires<[allowFMA]>;
 
    def f16rr_ftz :
-     NVPTXInst<(outs Float16Regs:$dst),
-               (ins Float16Regs:$a, Float16Regs:$b),
+     NVPTXInst<(outs Int16Regs:$dst),
+               (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
-               [(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
+               [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
                Requires<[useFP16Math, allowFMA, doF32FTZ]>;
    def f16rr :
-     NVPTXInst<(outs Float16Regs:$dst),
-               (ins Float16Regs:$a, Float16Regs:$b),
+     NVPTXInst<(outs Int16Regs:$dst),
+               (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
-               [(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
+               [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
                Requires<[useFP16Math, allowFMA]>;
 
    def f16x2rr_ftz :
-     NVPTXInst<(outs Float16x2Regs:$dst),
-               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+     NVPTXInst<(outs Int32Regs:$dst),
+               (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
-               [(set (v2f16 Float16x2Regs:$dst), (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
+               [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math, allowFMA, doF32FTZ]>;
    def f16x2rr :
-     NVPTXInst<(outs Float16x2Regs:$dst),
-               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+     NVPTXInst<(outs Int32Regs:$dst),
+               (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
-               [(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
+               [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math, allowFMA]>;
 
    // These have strange names so we don't perturb existing mir tests.
@@ -417,28 +417,28 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
                Requires<[noFMA]>;
    def _rnf16rr_ftz :
-     NVPTXInst<(outs Float16Regs:$dst),
-               (ins Float16Regs:$a, Float16Regs:$b),
+     NVPTXInst<(outs Int16Regs:$dst),
+               (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
-               [(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
+               [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
                Requires<[useFP16Math, noFMA, doF32FTZ]>;
    def _rnf16rr :
-     NVPTXInst<(outs Float16Regs:$dst),
-               (ins Float16Regs:$a, Float16Regs:$b),
+     NVPTXInst<(outs Int16Regs:$dst),
+               (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
-               [(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
+               [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
                Requires<[useFP16Math, noFMA]>;
    def _rnf16x2rr_ftz :
-     NVPTXInst<(outs Float16x2Regs:$dst),
-               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+     NVPTXInst<(outs Int32Regs:$dst),
+               (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
-               [(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
+               [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math, noFMA, doF32FTZ]>;
    def _rnf16x2rr :
-     NVPTXInst<(outs Float16x2Regs:$dst),
-               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+     NVPTXInst<(outs Int32Regs:$dst),
+               (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
-               [(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
+               [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math, noFMA]>;
 }
 
@@ -513,7 +513,7 @@ let hasSideEffects = false in {
                 FromName, ".u64 \t$dst, $src;"), []>;
     def _f16 :
       NVPTXInst<(outs RC:$dst),
-                (ins Float16Regs:$src, CvtMode:$mode),
+                (ins Int16Regs:$src, CvtMode:$mode),
                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
                 FromName, ".f16 \t$dst, $src;"), []>;
     def _f32 :
@@ -537,7 +537,7 @@ let hasSideEffects = false in {
   defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
   defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
   defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
-  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
+  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
   defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
   defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
 
@@ -576,7 +576,7 @@ multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> {
     Requires<[hasPTX<70>, hasSM<80>]>;
   }
 
-  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Float16x2Regs>;
+  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
   defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
 }
 
@@ -640,21 +640,21 @@ defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
 defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
 defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
 defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
-defm SELP_f16 : SELP_PATTERN<"b16", f16, Float16Regs, f16imm, fpimm>;
+defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
 
 defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
 defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
 
 // This does not work as tablegen fails to infer the type of 'imm'.
 // def v2f16imm : Operand<v2f16>;
-// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Float16x2Regs, v2f16imm, imm>;
+// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
 
 def SELP_f16x2rr :
-    NVPTXInst<(outs Float16x2Regs:$dst),
-              (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
               "selp.b32 \t$dst, $a, $b, $p;",
-              [(set Float16x2Regs:$dst,
-                    (select Int1Regs:$p, (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>;
+              [(set Int32Regs:$dst,
+                    (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>;
 
 //-----------------------------------
 // Test Instructions
@@ -783,26 +783,26 @@ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
 
 // Matchers for signed, unsigned mul.wide ISD nodes.
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
-          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
+          (MULWIDES32 i16:$a, i16:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
           (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
       Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
           (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
           (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
       Requires<[doMulWide]>;
 
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
           (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
           (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
       Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
           (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
@@ -1003,7 +1003,7 @@ def DoubleConst1 : PatLeaf<(fpimm), [{
 // fp16 immediate values in .f16 instructions. Instead we have to load
 // the constant into a register using mov.b16.
 def LOAD_CONST_F16 :
-  NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
+  NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
             "mov.b16 \t$dst, $a;", []>;
 
 defm FADD : F3_fma_component<"add", fadd>;
@@ -1028,10 +1028,10 @@ class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pre
                 !strconcat(OpcStr, " \t$dst, $src;"),
                 [(set RC:$dst, (fneg (T RC:$src)))]>,
                 Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
-def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Float16Regs, doF32FTZ>;
-def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Float16Regs, True>;
-def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Float16x2Regs, doF32FTZ>;
-def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Float16x2Regs, True>;
+def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
+def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
+def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
+def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
 
 //
 // F64 division
@@ -1211,10 +1211,10 @@ multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred>
                        Requires<[useFP16Math, Pred]>;
 }
 
-defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Float16Regs, doF32FTZ>;
-defm FMA16     : FMA_F16<"fma.rn.f16", f16, Float16Regs, True>;
-defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Float16x2Regs, doF32FTZ>;
-defm FMA16x2     : FMA_F16<"fma.rn.f16x2", v2f16, Float16x2Regs, True>;
+defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
+defm FMA16     : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
+defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
+defm FMA16x2     : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
 defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
 defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
 defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
@@ -1651,13 +1651,13 @@ defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
 defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
 def SETP_f16rr :
       NVPTXInst<(outs Int1Regs:$dst),
-                (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
+                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                 []>, Requires<[useFP16Math]>;
 
 def SETP_f16x2rr :
       NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
-                (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
+                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
                 []>,
                 Requires<[useFP16Math]>;
@@ -1690,7 +1690,7 @@ defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
 defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
 defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
 defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
-defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
+defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
 defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
 defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
 
@@ -1760,7 +1760,14 @@ let IsSimpleMove=1, hasSideEffects=0 in {
   def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                            "mov.u64 \t$dst, $sss;", []>;
 
-  def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
+  def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+                           "mov.b16 \t$dst, $sss;", []>;
+  def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                           "mov.b32 \t$dst, $sss;", []>;
+  def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                           "mov.b64 \t$dst, $sss;", []>;
+
+  def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                            // We have to use .b16 here as there's no mov.f16.
                            "mov.b16 \t$dst, $src;", []>;
   def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
@@ -1782,6 +1789,13 @@ def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                         "mov.u64 \t$dst, $src;",
                         [(set Int64Regs:$dst, imm:$src)]>;
 
+def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                         "mov.b16 \t$dst, $src;", []>;
+def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                         "mov.b32 \t$dst, $src;", []>;
+def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+                        "mov.b64 \t$dst, $src;", []>;
+
 def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                          "mov.f32 \t$dst, $src;",
                          [(set Float32Regs:$dst, fpimm:$src)]>;
@@ -1824,14 +1838,14 @@ multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                        Instruction set_64ri,
                        Instruction set_64ir> {
   // i16 -> pred
-  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
             (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
   def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
             (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
             (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
   // i32 -> pred
-  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
             (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
   def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
             (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
@@ -1846,14 +1860,14 @@ multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
             (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
 
   // i16 -> i32
-  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
             (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
   def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
             (set_16ri Int16Regs:$a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
             (set_16ir imm:$a, Int16Regs:$b, Mode)>;
   // i32 -> i32
-  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
             (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
   def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
             (set_32ri Int32Regs:$a, imm:$b, Mode)>;
@@ -1926,23 +1940,23 @@ def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
 
 multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // f16 -> pred
-  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
-            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
+            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
         Requires<[useFP16Math,doF32FTZ]>;
-  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
-            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
+            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
         Requires<[useFP16Math]>;
-  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
-            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
+            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
         Requires<[useFP16Math,doF32FTZ]>;
-  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
-            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
+            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
         Requires<[useFP16Math]>;
-  def : Pat<(i1 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
-            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
+            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
         Requires<[useFP16Math,doF32FTZ]>;
-  def : Pat<(i1 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
-            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
+            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
         Requires<[useFP16Math]>;
 
   // f32 -> pred
@@ -1971,23 +1985,23 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
             (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
 
   // f16 -> i32
-  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
-            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
+            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
         Requires<[useFP16Math, doF32FTZ]>;
-  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
-            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
+            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
         Requires<[useFP16Math]>;
-  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
-            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
+            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
         Requires<[useFP16Math, doF32FTZ]>;
-  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
-            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
+            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
         Requires<[useFP16Math]>;
-  def : Pat<(i32 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
-            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
+            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
         Requires<[useFP16Math, doF32FTZ]>;
-  def : Pat<(i32 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
-            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
+            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
         Requires<[useFP16Math]>;
 
   // f32 -> i32
@@ -2276,16 +2290,10 @@ def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
 def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
 def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
 def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
-def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;
-def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
 def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
 def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
-def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
-def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
 def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
 def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
-def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
-def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
 def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
 
 def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
@@ -2302,16 +2310,10 @@ def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
 def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
 def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
 
-def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;
-def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
 def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
 def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
-def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
-def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
 def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
 def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
-def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
-def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
 def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
 
 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
@@ -2328,15 +2330,9 @@ def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
 
 def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
 def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
-def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;
-def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
 def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
 def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
-def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
-def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
 def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
-def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
-def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
 
 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
 def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
@@ -2347,19 +2343,26 @@ class CallArgInst<NVPTXRegClass regclass> :
   NVPTXInst<(outs), (ins regclass:$a), "$a, ",
             [(CallArg (i32 0), regclass:$a)]>;
 
+class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
+  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+            [(CallArg (i32 0), vt:$a)]>;
+
 class LastCallArgInst<NVPTXRegClass regclass> :
   NVPTXInst<(outs), (ins regclass:$a), "$a",
             [(LastCallArg (i32 0), regclass:$a)]>;
+class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
+  NVPTXInst<(outs), (ins regclass:$a), "$a",
+            [(LastCallArg (i32 0), vt:$a)]>;
 
 def CallArgI64     : CallArgInst<Int64Regs>;
-def CallArgI32     : CallArgInst<Int32Regs>;
-def CallArgI16     : CallArgInst<Int16Regs>;
+def CallArgI32     : CallArgInstVT<Int32Regs, i32>;
+def CallArgI16     : CallArgInstVT<Int16Regs, i16>;
 def CallArgF64     : CallArgInst<Float64Regs>;
 def CallArgF32     : CallArgInst<Float32Regs>;
 
 def LastCallArgI64 : LastCallArgInst<Int64Regs>;
-def LastCallArgI32 : LastCallArgInst<Int32Regs>;
-def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
+def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
 def LastCallArgF64 : LastCallArgInst<Float64Regs>;
 def LastCallArgF32 : LastCallArgInst<Float32Regs>;
 
@@ -2376,7 +2379,7 @@ def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
 def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
                                   [(CallVoid (Wrapper tglobaladdr:$addr))]>;
 def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
-                                  [(CallVoid Int32Regs:$addr)]>;
+                                  [(CallVoid i32:$addr)]>;
 def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
                                   [(CallVoid Int64Regs:$addr)]>;
 def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
@@ -2413,53 +2416,54 @@ class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
             !strconcat("mov", asmstr, " \t$dst, $src;"),
             [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;
 
-class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,
+class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                           string asmstr> :
   NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
             !strconcat("mov", asmstr, " \t$dst, $src;"),
-            [(set regclass:$dst, (MoveParam texternalsym:$src))]>;
+            [(set vt:$dst, (MoveParam texternalsym:$src))]>;
 
 def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
 def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;
 
-def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
-def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;
+def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
+def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
 
 def MoveParamI16 :
   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
-            "cvt.u16.u32 \t$dst, $src;",
-            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+            "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ? 
+            [(set i16:$dst, (MoveParam i16:$src))]>;
 def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
 def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
-def MoveParamF16 : MoveParamInst<f16, Float16Regs, ".f16">;
 
-class PseudoUseParamInst<NVPTXRegClass regclass> :
+class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
   NVPTXInst<(outs), (ins regclass:$src),
             "// Pseudo use of $src",
-            [(PseudoUseParam regclass:$src)]>;
+            [(PseudoUseParam vt:$src)]>;
 
-def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
-def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
-def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
-def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
-def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
 
 class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
             !strconcat("mov.", SzStr, " \t$dst, $src;"),
             [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;
 
-let isCodeGenOnly=1, isPseudo=1 in {
-  def ProxyRegI1    : ProxyRegInst<"pred", i1, Int1Regs>;
-  def ProxyRegI16   : ProxyRegInst<"b16",  i16, Int16Regs>;
-  def ProxyRegI32   : ProxyRegInst<"b32",  i32, Int32Regs>;
-  def ProxyRegI64   : ProxyRegInst<"b64",  i64, Int64Regs>;
-  def ProxyRegF16   : ProxyRegInst<"b16",  f16, Float16Regs>;
-  def ProxyRegBF16  : ProxyRegInst<"b16",  bf16, Float16Regs>;
-  def ProxyRegF32   : ProxyRegInst<"f32",  f32, Float32Regs>;
-  def ProxyRegF64   : ProxyRegInst<"f64",  f64, Float64Regs>;
-  def ProxyRegF16x2 : ProxyRegInst<"b32",  v2f16, Float16x2Regs>;
-  def ProxyRegBF16x2 : ProxyRegInst<"b32",  v2bf16, Float16x2Regs>;
+def ProxyRegI1    : ProxyRegInst<"pred", i1, Int1Regs>;
+def ProxyRegI16   : ProxyRegInst<"b16",  i16, Int16Regs>;
+def ProxyRegI32   : ProxyRegInst<"b32",  i32, Int32Regs>;
+def ProxyRegI64   : ProxyRegInst<"b64",  i64, Int64Regs>;
+def ProxyRegF32   : ProxyRegInst<"f32",  f32, Float32Regs>;
+def ProxyRegF64   : ProxyRegInst<"f64",  f64, Float64Regs>;
+
+foreach vt = [f16, bf16] in {
+  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
+}
+
+foreach vt = [v2f16, v2bf16] in {
+  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
 }
 
 //
@@ -2509,8 +2513,6 @@ let mayLoad=1, hasSideEffects=0 in {
   defm LD_i16 : LD<Int16Regs>;
   defm LD_i32 : LD<Int32Regs>;
   defm LD_i64 : LD<Int64Regs>;
-  defm LD_f16 : LD<Float16Regs>;
-  defm LD_f16x2 : LD<Float16x2Regs>;
   defm LD_f32 : LD<Float32Regs>;
   defm LD_f64 : LD<Float64Regs>;
 }
@@ -2559,8 +2561,6 @@ let mayStore=1, hasSideEffects=0 in {
   defm ST_i16 : ST<Int16Regs>;
   defm ST_i32 : ST<Int32Regs>;
   defm ST_i64 : ST<Int64Regs>;
-  defm ST_f16 : ST<Float16Regs>;
-  defm ST_f16x2 : ST<Float16x2Regs>;
   defm ST_f32 : ST<Float32Regs>;
   defm ST_f64 : ST<Float64Regs>;
 }
@@ -2647,8 +2647,8 @@ let mayLoad=1, hasSideEffects=0 in {
   defm LDV_i16 : LD_VEC<Int16Regs>;
   defm LDV_i32 : LD_VEC<Int32Regs>;
   defm LDV_i64 : LD_VEC<Int64Regs>;
-  defm LDV_f16 : LD_VEC<Float16Regs>;
-  defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
+  defm LDV_f16 : LD_VEC<Int16Regs>;
+  defm LDV_f16x2 : LD_VEC<Int32Regs>;
   defm LDV_f32 : LD_VEC<Float32Regs>;
   defm LDV_f64 : LD_VEC<Float64Regs>;
 }
@@ -2742,8 +2742,8 @@ let mayStore=1, hasSideEffects=0 in {
   defm STV_i16 : ST_VEC<Int16Regs>;
   defm STV_i32 : ST_VEC<Int32Regs>;
   defm STV_i64 : ST_VEC<Int64Regs>;
-  defm STV_f16 : ST_VEC<Float16Regs>;
-  defm STV_f16x2 : ST_VEC<Float16x2Regs>;
+  defm STV_f16 : ST_VEC<Int16Regs>;
+  defm STV_f16x2 : ST_VEC<Int32Regs>;
   defm STV_f32 : ST_VEC<Float32Regs>;
   defm STV_f64 : ST_VEC<Float64Regs>;
 }
@@ -2757,23 +2757,30 @@ class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
            !strconcat("mov.b", SzStr, " \t$d, $a;"),
      [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>;
 
-def BITCONVERT_16_I2F : F_BITCONVERT<"16", i16, f16>;
-def BITCONVERT_16_F2I : F_BITCONVERT<"16", f16, i16>;
-def BITCONVERT_16_I2BF : F_BITCONVERT<"16", i16, bf16>;
-def BITCONVERT_16_BF2I : F_BITCONVERT<"16", bf16, i16>;
 def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
 def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
 def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
 def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
-def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", i32, v2f16>;
-def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", v2f16, i32>;
-def BITCONVERT_32_F2F16x2 : F_BITCONVERT<"32", f32, v2f16>;
-def BITCONVERT_32_F16x22F : F_BITCONVERT<"32", v2f16, f32>;
-def BITCONVERT_32_I2BF16x2 : F_BITCONVERT<"32", i32, v2bf16>;
-def BITCONVERT_32_BF16x22I : F_BITCONVERT<"32", v2bf16, i32>;
-def BITCONVERT_32_F2BF16x2 : F_BITCONVERT<"32", f32, v2bf16>;
-def BITCONVERT_32_BF16x22F : F_BITCONVERT<"32", v2bf16, f32>;
 
+foreach vt = [v2f16, v2bf16] in {
+def: Pat<(vt (bitconvert (i32 UInt32Const:$a))),
+         (IMOVB32ri UInt32Const:$a)>;
+def: Pat<(vt (bitconvert (i32 Int32Regs:$a))),
+         (ProxyRegI32 Int32Regs:$a)>;
+def: Pat<(i32 (bitconvert (vt Int32Regs:$a))),
+         (ProxyRegI32 Int32Regs:$a)>;
+def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
+         (BITCONVERT_32_F2I Float32Regs:$a)>;
+}
+foreach vt = [f16, bf16] in {
+def: Pat<(vt (bitconvert (i16 UInt16Const:$a))),
+         (IMOVB16ri UInt16Const:$a)>;
+def: Pat<(vt (bitconvert (i16 Int16Regs:$a))),
+         (ProxyRegI16 Int16Regs:$a)>;
+def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
+         (ProxyRegI16 Int16Regs:$a)>;
+}
+        
 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
 // we cannot specify floating-point literals in isel patterns.  Therefore, we
 // use an integer selp to select either 1 or 0 and then cvt to floating-point.
@@ -2840,24 +2847,24 @@ def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
 
 
 // f16 -> sint
-def : Pat<(i1 (fp_to_sint (f16 Float16Regs:$a))),
-          (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint (f16 Float16Regs:$a))),
-          (CVT_s16_f16 (f16 Float16Regs:$a), CvtRZI)>;
-def : Pat<(i32 (fp_to_sint (f16 Float16Regs:$a))),
-          (CVT_s32_f16 (f16 Float16Regs:$a), CvtRZI)>;
-def : Pat<(i64 (fp_to_sint (f16 Float16Regs:$a))),
-          (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
+          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
+          (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
+def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
+          (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
+def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
+          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;
 
 // f16 -> uint
-def : Pat<(i1 (fp_to_uint (f16 Float16Regs:$a))),
-          (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint (f16 Float16Regs:$a))),
-          (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint (f16 Float16Regs:$a))),
-          (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint (f16 Float16Regs:$a))),
-          (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
+          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
+          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
+          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
+          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
 
 // f32 -> sint
 def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
@@ -2994,17 +3001,17 @@ def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
 
 
 // Select instructions with 32-bit predicates
-def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+def : Pat<(select Int32Regs:$pred, i16:$a, i16:$b),
           (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+def : Pat<(select Int32Regs:$pred, i32:$a, i32:$b),
           (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
 def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
           (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, (f16 Float16Regs:$a), (f16 Float16Regs:$b)),
-          (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
+def : Pat<(select Int32Regs:$pred, (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
+          (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
 def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
           (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
@@ -3070,39 +3077,39 @@ let hasSideEffects = false in {
   // Extract element of f16x2 register. PTX does not provide any way
   // to access elements of f16x2 vector directly, so we need to
   // extract it using a temporary register.
-  def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
-                               (ins Float16x2Regs:$src),
+  def F16x2toF16_0 : NVPTXInst<(outs Int16Regs:$dst),
+                               (ins Int32Regs:$src),
                                "{{ .reg .b16 \t%tmp_hi;\n\t"
                                "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
-                               [(set Float16Regs:$dst,
-                                 (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
-  def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
-                               (ins Float16x2Regs:$src),
+                               [(set Int16Regs:$dst,
+                                 (extractelt (v2f16 Int32Regs:$src), 0))]>;
+  def F16x2toF16_1 : NVPTXInst<(outs Int16Regs:$dst),
+                               (ins Int32Regs:$src),
                                "{{ .reg .b16 \t%tmp_lo;\n\t"
                                "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
-                               [(set Float16Regs:$dst,
-                                 (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
+                               [(set Int16Regs:$dst,
+                                 (extractelt (v2f16 Int32Regs:$src), 1))]>;
 
   // Coalesce two f16 registers into f16x2
-  def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
-                             (ins Float16Regs:$a, Float16Regs:$b),
+  def BuildF16x2 : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int16Regs:$a, Int16Regs:$b),
                              "mov.b32 \t$dst, {{$a, $b}};",
-                             [(set (v2f16 Float16x2Regs:$dst),
-                               (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
+                             [(set (v2f16 Int32Regs:$dst),
+                               (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>;
 
   // Directly initializing the underlying b32 register is one less SASS
   // instruction than a vector-packing move.
-  def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
+  def BuildF16x2i : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                               "mov.b32 \t$dst, $src;",
                               []>;
 
   // Split f16x2 into two f16 registers.
-  def SplitF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
-                              (ins Float16x2Regs:$src),
+  def SplitF16x2  : NVPTXInst<(outs Int16Regs:$lo, Int16Regs:$hi),
+                              (ins Int32Regs:$src),
                               "mov.b32 \t{{$lo, $hi}}, $src;",
                               []>;
   // Split an i32 into two f16
-  def SplitI32toF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+  def SplitI32toF16x2  : NVPTXInst<(outs Int16Regs:$lo, Int16Regs:$hi),
                                    (ins Int32Regs:$src),
                                    "mov.b32 \t{{$lo, $hi}}, $src;",
                                    []>;
@@ -3186,14 +3193,14 @@ def : Pat<(f32 (fpround Float64Regs:$a)),
           (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
 
 // fpextend f16 -> f32
-def : Pat<(f32 (fpextend (f16 Float16Regs:$a))),
-          (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpextend (f16 Float16Regs:$a))),
-          (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;
+def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
+          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
+          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
 
 // fpextend f16 -> f64
-def : Pat<(f64 (fpextend (f16 Float16Regs:$a))),
-          (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;
+def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
+          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
 
 // fpextend f32 -> f64
 def : Pat<(f64 (fpextend Float32Regs:$a)),
@@ -3207,8 +3214,8 @@ def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
 // fceil, ffloor, froundeven, ftrunc.
 
 multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
-  def : Pat<(OpNode (f16 Float16Regs:$a)),
-            (CVT_f16_f16 Float16Regs:$a, Mode)>;
+  def : Pat<(OpNode (f16 Int16Regs:$a)),
+            (CVT_f16_f16 Int16Regs:$a, Mode)>;
   def : Pat<(OpNode Float32Regs:$a),
             (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
   def : Pat<(OpNode Float32Regs:$a),

diff  --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 16ae89cebfc8b..bfc79d383191b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -686,46 +686,46 @@ class MIN_MAX_TUPLE<string V, Intrinsic I, NVPTXRegClass RC,
 multiclass MIN_MAX<string IntName> {
   foreach P = [
     MIN_MAX_TUPLE<"_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_f16,
-      int_nvvm_fmax_f16), Float16Regs>,
+      int_nvvm_fmax_f16), Int16Regs>,
     MIN_MAX_TUPLE<"_ftz_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_ftz_f16,
-      int_nvvm_fmax_ftz_f16), Float16Regs>,
+      int_nvvm_fmax_ftz_f16), Int16Regs>,
     MIN_MAX_TUPLE<"_NaN_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_f16,
-      int_nvvm_fmax_nan_f16), Float16Regs>,
+      int_nvvm_fmax_nan_f16), Int16Regs>,
     MIN_MAX_TUPLE<"_ftz_NaN_f16", !if(!eq(IntName, "min"),
-      int_nvvm_fmin_ftz_nan_f16, int_nvvm_fmax_ftz_nan_f16), Float16Regs>,
+      int_nvvm_fmin_ftz_nan_f16, int_nvvm_fmax_ftz_nan_f16), Int16Regs>,
     MIN_MAX_TUPLE<"_xorsign_abs_f16", !if(!eq(IntName, "min"),
       int_nvvm_fmin_xorsign_abs_f16, int_nvvm_fmax_xorsign_abs_f16),
-      Float16Regs, [hasPTX<72>, hasSM<86>]>,
+      Int16Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16", !if(!eq(IntName, "min"),
       int_nvvm_fmin_ftz_xorsign_abs_f16, int_nvvm_fmax_ftz_xorsign_abs_f16),
-      Float16Regs, [hasPTX<72>, hasSM<86>]>,
+      Int16Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"),
       int_nvvm_fmin_nan_xorsign_abs_f16, int_nvvm_fmax_nan_xorsign_abs_f16),
-      Float16Regs, [hasPTX<72>, hasSM<86>]>,
+      Int16Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"),
       int_nvvm_fmin_ftz_nan_xorsign_abs_f16,
-      int_nvvm_fmax_ftz_nan_xorsign_abs_f16), Float16Regs, [hasPTX<72>, hasSM<86>]>,
+      int_nvvm_fmax_ftz_nan_xorsign_abs_f16), Int16Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_f16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_f16x2,
-      int_nvvm_fmax_f16x2), Float16x2Regs>,
+      int_nvvm_fmax_f16x2), Int32Regs>,
     MIN_MAX_TUPLE<"_ftz_f16x2", !if(!eq(IntName, "min"),
-      int_nvvm_fmin_ftz_f16x2, int_nvvm_fmax_ftz_f16x2), Float16x2Regs>,
+      int_nvvm_fmin_ftz_f16x2, int_nvvm_fmax_ftz_f16x2), Int32Regs>,
     MIN_MAX_TUPLE<"_NaN_f16x2", !if(!eq(IntName, "min"),
-      int_nvvm_fmin_nan_f16x2, int_nvvm_fmax_nan_f16x2), Float16x2Regs>,
+      int_nvvm_fmin_nan_f16x2, int_nvvm_fmax_nan_f16x2), Int32Regs>,
     MIN_MAX_TUPLE<"_ftz_NaN_f16x2", !if(!eq(IntName, "min"),
-      int_nvvm_fmin_ftz_nan_f16x2, int_nvvm_fmax_ftz_nan_f16x2), Float16x2Regs>,
+      int_nvvm_fmin_ftz_nan_f16x2, int_nvvm_fmax_ftz_nan_f16x2), Int32Regs>,
     MIN_MAX_TUPLE<"_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
       int_nvvm_fmin_xorsign_abs_f16x2, int_nvvm_fmax_xorsign_abs_f16x2),
-      Float16x2Regs, [hasPTX<72>, hasSM<86>]>,
+      Int32Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
       int_nvvm_fmin_ftz_xorsign_abs_f16x2, int_nvvm_fmax_ftz_xorsign_abs_f16x2),
-      Float16x2Regs, [hasPTX<72>, hasSM<86>]>,
+      Int32Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
       int_nvvm_fmin_nan_xorsign_abs_f16x2, int_nvvm_fmax_nan_xorsign_abs_f16x2),
-      Float16x2Regs, [hasPTX<72>, hasSM<86>]>,
+      Int32Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
       int_nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
       int_nvvm_fmax_ftz_nan_xorsign_abs_f16x2),
-      Float16x2Regs, [hasPTX<72>, hasSM<86>]>,
+      Int32Regs, [hasPTX<72>, hasSM<86>]>,
     MIN_MAX_TUPLE<"_bf16", !if(!eq(IntName, "min"),
       int_nvvm_fmin_bf16, int_nvvm_fmax_bf16), Int16Regs>,
     MIN_MAX_TUPLE<"_NaN_bf16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_bf16,
@@ -933,9 +933,9 @@ def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
 def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
   Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
 def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;",
-  Float16Regs, Float16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>;
+  Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>;
 def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;",
-  Float16x2Regs, Float16x2Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
+  Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
 
 def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
@@ -986,30 +986,30 @@ multiclass FMA_INST {
     FMA_TUPLE<"_rp_f32", int_nvvm_fma_rp_f, Float32Regs>,
     FMA_TUPLE<"_rp_ftz_f32", int_nvvm_fma_rp_ftz_f, Float32Regs>,
 
-    FMA_TUPLE<"_rn_f16", int_nvvm_fma_rn_f16, Float16Regs, [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_ftz_f16", int_nvvm_fma_rn_ftz_f16, Float16Regs,
+    FMA_TUPLE<"_rn_f16", int_nvvm_fma_rn_f16, Int16Regs, [hasPTX<42>, hasSM<53>]>,
+    FMA_TUPLE<"_rn_ftz_f16", int_nvvm_fma_rn_ftz_f16, Int16Regs,
       [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_sat_f16", int_nvvm_fma_rn_sat_f16, Float16Regs,
+    FMA_TUPLE<"_rn_sat_f16", int_nvvm_fma_rn_sat_f16, Int16Regs,
       [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_ftz_sat_f16", int_nvvm_fma_rn_ftz_sat_f16, Float16Regs,
+    FMA_TUPLE<"_rn_ftz_sat_f16", int_nvvm_fma_rn_ftz_sat_f16, Int16Regs,
       [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_relu_f16", int_nvvm_fma_rn_relu_f16, Float16Regs,
+    FMA_TUPLE<"_rn_relu_f16", int_nvvm_fma_rn_relu_f16, Int16Regs,
       [hasPTX<70>, hasSM<80>]>,
-    FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Float16Regs,
+    FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Int16Regs,
       [hasPTX<70>, hasSM<80>]>,
 
-    FMA_TUPLE<"_rn_f16x2", int_nvvm_fma_rn_f16x2, Float16x2Regs,
+    FMA_TUPLE<"_rn_f16x2", int_nvvm_fma_rn_f16x2, Int32Regs,
       [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_ftz_f16x2", int_nvvm_fma_rn_ftz_f16x2, Float16x2Regs,
+    FMA_TUPLE<"_rn_ftz_f16x2", int_nvvm_fma_rn_ftz_f16x2, Int32Regs,
       [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_sat_f16x2", int_nvvm_fma_rn_sat_f16x2, Float16x2Regs,
+    FMA_TUPLE<"_rn_sat_f16x2", int_nvvm_fma_rn_sat_f16x2, Int32Regs,
       [hasPTX<42>, hasSM<53>]>,
     FMA_TUPLE<"_rn_ftz_sat_f16x2", int_nvvm_fma_rn_ftz_sat_f16x2,
-      Float16x2Regs, [hasPTX<42>, hasSM<53>]>,
-    FMA_TUPLE<"_rn_relu_f16x2", int_nvvm_fma_rn_relu_f16x2, Float16x2Regs,
+      Int32Regs, [hasPTX<42>, hasSM<53>]>,
+    FMA_TUPLE<"_rn_relu_f16x2", int_nvvm_fma_rn_relu_f16x2, Int32Regs,
       [hasPTX<70>, hasSM<80>]>,
     FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
-      Float16x2Regs, [hasPTX<70>, hasSM<80>]>,
+      Int32Regs, [hasPTX<70>, hasSM<80>]>,
 
     FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, Int16Regs, [hasPTX<70>, hasSM<80>]>,
     FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, Int16Regs,
@@ -1397,9 +1397,9 @@ def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
 
 
 def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
-          (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ))>;
+          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
 def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
-          (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
+          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
 
 //
 // Bitcast
@@ -2159,12 +2159,8 @@ defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
 defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
 defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
 defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDU_GLOBAL_f16 : LDU_G<"b16 \t$result, [$src];", Float16Regs>;
-defm INT_PTX_LDU_GLOBAL_f16x2 : LDU_G<"b32 \t$result, [$src];", Float16x2Regs>;
 defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
 defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
-defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
 
 // vector
 
@@ -2212,9 +2208,9 @@ defm INT_PTX_LDU_G_v2i16_ELE
 defm INT_PTX_LDU_G_v2i32_ELE
   : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
 defm INT_PTX_LDU_G_v2f16_ELE
-  : VLDU_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+  : VLDU_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
 defm INT_PTX_LDU_G_v2f16x2_ELE
-  : VLDU_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
+  : VLDU_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
 defm INT_PTX_LDU_G_v2f32_ELE
   : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
 defm INT_PTX_LDU_G_v2i64_ELE
@@ -2231,10 +2227,10 @@ defm INT_PTX_LDU_G_v4i32_ELE
     Int32Regs>;
 defm INT_PTX_LDU_G_v4f16_ELE
   : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Float16Regs>;
+    Int16Regs>;
 defm INT_PTX_LDU_G_v4f16x2_ELE
   : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Float16x2Regs>;
+    Int32Regs>;
 defm INT_PTX_LDU_G_v4f32_ELE
   : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
     Float32Regs>;
@@ -2274,18 +2270,10 @@ defm INT_PTX_LDG_GLOBAL_i32
   : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
 defm INT_PTX_LDG_GLOBAL_i64
   : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDG_GLOBAL_f16
-  : LDG_G<"b16 \t$result, [$src];", Float16Regs>;
-defm INT_PTX_LDG_GLOBAL_f16x2
-  : LDG_G<"b32 \t$result, [$src];", Float16x2Regs>;
 defm INT_PTX_LDG_GLOBAL_f32
   : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
 defm INT_PTX_LDG_GLOBAL_f64
   : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
-defm INT_PTX_LDG_GLOBAL_p32
-  : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDG_GLOBAL_p64
-  : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
 
 // vector
 
@@ -2333,10 +2321,6 @@ defm INT_PTX_LDG_G_v2i16_ELE
   : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
 defm INT_PTX_LDG_G_v2i32_ELE
   : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v2f16_ELE
-  : VLDG_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
-defm INT_PTX_LDG_G_v2f16x2_ELE
-  : VLDG_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
 defm INT_PTX_LDG_G_v2f32_ELE
   : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
 defm INT_PTX_LDG_G_v2i64_ELE
@@ -2349,10 +2333,6 @@ defm INT_PTX_LDG_G_v4i16_ELE
   : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
 defm INT_PTX_LDG_G_v4i32_ELE
   : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v4f16_ELE
-  : VLDG_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16Regs>;
-defm INT_PTX_LDG_G_v4f16x2_ELE
-  : VLDG_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16x2Regs>;
 defm INT_PTX_LDG_G_v4f32_ELE
   : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
 
@@ -6305,7 +6285,7 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
       : WMMA_REGS<r.geom, r.frag, r.ptx_elt_type> {
   // NVPTX register types used to carry fragment data.
   NVPTXRegClass regclass = !cond(
-    !eq(ptx_elt_type, "f16") : Float16x2Regs,
+    !eq(ptx_elt_type, "f16") : Int32Regs,
     !eq(ptx_elt_type, "f32") : Float32Regs,
     !eq(ptx_elt_type, "f64") : Float64Regs,
     !eq(ptx_elt_type, "bf16") : Int32Regs,

diff  --git a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
index 869231ff4ffe9..258ae97a20d58 100644
--- a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
@@ -73,10 +73,6 @@ bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) {
       case NVPTX::ProxyRegI16:
       case NVPTX::ProxyRegI32:
       case NVPTX::ProxyRegI64:
-      case NVPTX::ProxyRegF16:
-      case NVPTX::ProxyRegF16x2:
-      case NVPTX::ProxyRegBF16:
-      case NVPTX::ProxyRegBF16x2:
       case NVPTX::ProxyRegF32:
       case NVPTX::ProxyRegF64:
         replaceMachineInstructionUsage(MF, MI);

diff  --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 6e4208d272412..f1213f030bba7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -29,14 +29,6 @@ namespace llvm {
 std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
     return ".f32";
-  if (RC == &NVPTX::Float16RegsRegClass)
-    // Ideally fp16 registers should be .f16, but this syntax is only
-    // supported on sm_53+. On the other hand, .b16 registers are
-    // accepted for all supported fp16 instructions on all GPU
-    // variants, so we can use them instead.
-    return ".b16";
-  if (RC == &NVPTX::Float16x2RegsRegClass)
-    return ".b32";
   if (RC == &NVPTX::Float64RegsRegClass)
     return ".f64";
   if (RC == &NVPTX::Int64RegsRegClass)
@@ -73,10 +65,6 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
 std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
     return "%f";
-  if (RC == &NVPTX::Float16RegsRegClass)
-    return "%h";
-  if (RC == &NVPTX::Float16x2RegsRegClass)
-    return "%hh";
   if (RC == &NVPTX::Float64RegsRegClass)
     return "%fd";
   if (RC == &NVPTX::Int64RegsRegClass)

diff  --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 31d5441e58b38..b62460e8cd31f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -57,11 +57,11 @@ foreach i = 0...31 in {
 //  Register classes
 //===----------------------------------------------------------------------===//
 def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
-def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
-def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>;
+def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16], 32,
+                              (add (sequence "R%u", 0, 4),
+                              VRFrame32, VRFrameLocal32)>;
 def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
-def Float16Regs : NVPTXRegClass<[f16,bf16], 16, (add (sequence "H%u", 0, 4))>;
-def Float16x2Regs : NVPTXRegClass<[v2f16,v2bf16], 32, (add (sequence "HH%u", 0, 4))>;
 def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
 def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
 def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;

diff  --git a/llvm/test/CodeGen/NVPTX/bf16.ll b/llvm/test/CodeGen/NVPTX/bf16.ll
index 80113f510a05a..c2cf804c5013a 100644
--- a/llvm/test/CodeGen/NVPTX/bf16.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16.ll
@@ -7,7 +7,7 @@
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_load_store
-; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
   %val = load bfloat, ptr addrspace(1) %in
   store bfloat %val, ptr addrspace(1) %out
@@ -16,7 +16,7 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 
 define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_bitcast_from_bfloat
-; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
   %val = load bfloat, ptr addrspace(1) %in
   %val_int = bitcast bfloat %val to i16

diff  --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
index 0a411f4ef4d5a..56f9c905caa91 100644
--- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
@@ -14,8 +14,8 @@ target triple = "nvptx64-nvidia-cuda"
 %complex_half = type { half, half }
 
 ; CHECK: .param .align 2 .b8 param2[4];
-; CHECK: st.param.b16   [param2+0], %h1;
-; CHECK: st.param.b16   [param2+2], %h2;
+; CHECK: st.param.b16   [param2+0], %rs1;
+; CHECK: st.param.b16   [param2+2], %rs2;
 ; CHECK: .param .align 2 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: _Z20__spirv_GroupCMulKHRjjN5__spv12complex_halfE,
@@ -37,8 +37,8 @@ define internal void @callee(ptr byval(%"class.complex") %byval_arg) {
 define void @boom() {
   %fp = call ptr @usefp(ptr @callee)
   ; CHECK: .param .align 2 .b8 param0[4];
-  ; CHECK: st.param.b16 [param0+0], %h1;
-  ; CHECK: st.param.b16 [param0+2], %h2;
+  ; CHECK: st.param.b16 [param0+0], %rs1;
+  ; CHECK: st.param.b16 [param0+2], %rs2;
   ; CHECK: .callprototype ()_ (.param .align 2 .b8 _[4]);
   call void %fp(ptr byval(%"class.complex") null)
   ret void

diff  --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 0b994a7406773..55fde7837487b 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -43,7 +43,7 @@
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: test_ret_const(
-; CHECK:      mov.b16         [[R:%h[0-9]+]], 0x3C00;
+; CHECK:      mov.b16         [[R:%rs[0-9]+]], 0x3C00;
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_ret_const() #0 {
@@ -51,14 +51,14 @@ define half @test_ret_const() #0 {
 }
 
 ; CHECK-LABEL: test_fadd(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fadd_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fadd_param_1];
-; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fadd_param_1];
+; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
+; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fadd(half %a, half %b) #0 {
@@ -67,14 +67,14 @@ define half @test_fadd(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fadd_v1f16(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
-; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fadd_v1f16_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fadd_v1f16_param_1];
+; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
+; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
@@ -84,14 +84,14 @@ define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
 
 ; Check that we can lower fadd with immediate arguments.
 ; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
-; CHECK-F16-NOFTZ-DAG:    mov.b16        [[A:%h[0-9]+]], 0x3C00;
-; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%h[0-9]+]], [[B]], [[A]];
-; CHECK-F16-FTZ-DAG:    mov.b16        [[A:%h[0-9]+]], 0x3C00;
-; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fadd_imm_0_param_0];
+; CHECK-F16-NOFTZ-DAG:    mov.b16        [[A:%rs[0-9]+]], 0x3C00;
+; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
+; CHECK-F16-FTZ-DAG:    mov.b16        [[A:%rs[0-9]+]], 0x3C00;
+; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fadd_imm_0(half %b) #0 {
@@ -100,14 +100,14 @@ define half @test_fadd_imm_0(half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fadd_imm_1_param_0];
-; CHECK-F16-NOFTZ-DAG:    mov.b16        [[A:%h[0-9]+]], 0x3C00;
-; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%h[0-9]+]], [[B]], [[A]];
-; CHECK-F16-FTZ-DAG:    mov.b16        [[A:%h[0-9]+]], 0x3C00;
-; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fadd_imm_1_param_0];
+; CHECK-F16-NOFTZ-DAG:    mov.b16        [[A:%rs[0-9]+]], 0x3C00;
+; CHECK-F16-NOFTZ-NEXT:   add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
+; CHECK-F16-FTZ-DAG:    mov.b16        [[A:%rs[0-9]+]], 0x3C00;
+; CHECK-F16-FTZ-NEXT:   add.rn.ftz.f16     [[R:%rs[0-9]+]], [[B]], [[A]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fadd_imm_1(half %a) #0 {
@@ -116,14 +116,14 @@ define half @test_fadd_imm_1(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fsub(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fsub_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NOFTZ-NEXT:   sub.rn.f16     [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-F16-FTZ-NEXT:   sub.rn.ftz.f16     [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NOFTZ-NEXT:   sub.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
+; CHECK-F16-FTZ-NEXT:   sub.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: sub.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fsub(half %a, half %b) #0 {
@@ -132,15 +132,15 @@ define half @test_fsub(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fneg(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fneg_param_0];
-; CHECK-F16-NOFTZ-NEXT:   mov.b16        [[Z:%h[0-9]+]], 0x0000
-; CHECK-F16-NOFTZ-NEXT:   sub.rn.f16     [[R:%h[0-9]+]], [[Z]], [[A]];
-; CHECK-F16-FTZ-NEXT:   mov.b16        [[Z:%h[0-9]+]], 0x0000
-; CHECK-F16-FTZ-NEXT:   sub.rn.ftz.f16     [[R:%h[0-9]+]], [[Z]], [[A]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fneg_param_0];
+; CHECK-F16-NOFTZ-NEXT:   mov.b16        [[Z:%rs[0-9]+]], 0x0000
+; CHECK-F16-NOFTZ-NEXT:   sub.rn.f16     [[R:%rs[0-9]+]], [[Z]], [[A]];
+; CHECK-F16-FTZ-NEXT:   mov.b16        [[Z:%rs[0-9]+]], 0x0000
+; CHECK-F16-FTZ-NEXT:   sub.rn.ftz.f16     [[R:%rs[0-9]+]], [[Z]], [[A]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  mov.f32        [[Z:%f[0-9]+]], 0f00000000;
 ; CHECK-NOF16-NEXT: sub.rn.f32     [[R32:%f[0-9]+]], [[Z]], [[A32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fneg(half %a) #0 {
@@ -149,14 +149,14 @@ define half @test_fneg(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fmul(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NOFTZ-NEXT: mul.rn.f16      [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-F16-FTZ-NEXT: mul.rn.ftz.f16      [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NOFTZ-NEXT: mul.rn.f16      [[R:%rs[0-9]+]], [[A]], [[B]];
+; CHECK-F16-FTZ-NEXT: mul.rn.ftz.f16      [[R:%rs[0-9]+]], [[A]], [[B]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: mul.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fmul(half %a, half %b) #0 {
@@ -165,15 +165,15 @@ define half @test_fmul(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fdiv(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fdiv_param_1];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[F0:%f[0-9]+]], [[A]];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[F1:%f[0-9]+]], [[B]];
 ; CHECK-NOFTZ-NEXT: div.rn.f32      [[FR:%f[0-9]+]], [[F0]], [[F1]];
 ; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[F0:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[F1:%f[0-9]+]], [[B]];
 ; CHECK-F16-FTZ-NEXT: div.rn.ftz.f32      [[FR:%f[0-9]+]], [[F0]], [[F1]];
-; CHECK-NEXT: cvt.rn.f16.f32  [[R:%h[0-9]+]], [[FR]];
+; CHECK-NEXT: cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[FR]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fdiv(half %a, half %b) #0 {
@@ -182,8 +182,8 @@ define half @test_fdiv(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_frem(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_frem_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_frem_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_frem_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_frem_param_1];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[FA:%f[0-9]+]], [[A]];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[FB:%f[0-9]+]], [[B]];
 ; CHECK-NOFTZ-NEXT: div.rn.f32      [[D:%f[0-9]+]], [[FA]], [[FB]];
@@ -198,7 +198,7 @@ define half @test_fdiv(half %a, half %b) #0 {
 ; CHECK-F16-FTZ-NEXT: sub.ftz.f32         [[RF:%f[0-9]+]], [[FA]], [[RI]];
 ; CHECK-NEXT: testp.infinite.f32 [[ISBINF:%p[0-9]+]], [[FB]];
 ; CHECK-NEXT: selp.f32           [[RESULT:%f[0-9]+]], [[FA]], [[RF]], [[ISBINF]];
-; CHECK-NEXT: cvt.rn.f16.f32     [[R:%h[0-9]+]], [[RESULT]];
+; CHECK-NEXT: cvt.rn.f16.f32     [[R:%rs[0-9]+]], [[RESULT]];
 ; CHECK-NEXT: st.param.b16       [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_frem(half %a, half %b) #0 {
@@ -207,7 +207,7 @@ define half @test_frem(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_store(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_store_param_0];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_store_param_0];
 ; CHECK-DAG:  ld.param.u64    %[[PTR:rd[0-9]+]], [test_store_param_1];
 ; CHECK-NEXT: st.b16          [%[[PTR]]], [[A]];
 ; CHECK-NEXT: ret;
@@ -218,7 +218,7 @@ define void @test_store(half %a, ptr %b) #0 {
 
 ; CHECK-LABEL: test_load(
 ; CHECK:      ld.param.u64    %[[PTR:rd[0-9]+]], [test_load_param_0];
-; CHECK-NEXT: ld.b16          [[R:%h[0-9]+]], [%[[PTR]]];
+; CHECK-NEXT: ld.b16          [[R:%rs[0-9]+]], [%[[PTR]]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_load(ptr %a) #0 {
@@ -243,8 +243,8 @@ define void @test_halfp0a1(ptr noalias readonly %from, ptr %to) {
 declare half @test_callee(half %a, half %b) #0
 
 ; CHECK-LABEL: test_call(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_call_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_call_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_call_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_call_param_1];
 ; CHECK:      {
 ; CHECK-DAG:  .param .b32 param0;
 ; CHECK-DAG:  .param .b32 param1;
@@ -254,7 +254,7 @@ declare half @test_callee(half %a, half %b) #0
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT:        test_callee,
 ; CHECK:      );
-; CHECK-NEXT: ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
 ; CHECK-NEXT: }
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -264,8 +264,8 @@ define half @test_call(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_call_flipped(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_call_flipped_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_call_flipped_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_call_flipped_param_1];
 ; CHECK:      {
 ; CHECK-DAG:  .param .b32 param0;
 ; CHECK-DAG:  .param .b32 param1;
@@ -275,7 +275,7 @@ define half @test_call(half %a, half %b) #0 {
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT:        test_callee,
 ; CHECK:      );
-; CHECK-NEXT: ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
 ; CHECK-NEXT: }
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -285,8 +285,8 @@ define half @test_call_flipped(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_tailcall_flipped(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_tailcall_flipped_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_tailcall_flipped_param_1];
 ; CHECK:      {
 ; CHECK-DAG:  .param .b32 param0;
 ; CHECK-DAG:  .param .b32 param1;
@@ -296,7 +296,7 @@ define half @test_call_flipped(half %a, half %b) #0 {
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT:        test_callee,
 ; CHECK:      );
-; CHECK-NEXT: ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
 ; CHECK-NEXT: }
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -306,10 +306,10 @@ define half @test_tailcall_flipped(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_select(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_select_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_select_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_select_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_select_param_1];
 ; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b16        [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: selp.b16        [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
@@ -318,15 +318,15 @@ define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
 }
 
 ; CHECK-LABEL: test_select_cc(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG:  ld.param.b16    [[C:%h[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG:  ld.param.b16    [[D:%h[0-9]+]], [test_select_cc_param_3];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG:  ld.param.b16    [[D:%rs[0-9]+]], [test_select_cc_param_3];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
 ; CHECK-NOF16: setp.neu.f32    [[PRED:%p[0-9]+]], [[CF]], [[DF]]
-; CHECK:      selp.b16        [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK:      selp.b16        [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
@@ -338,8 +338,8 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
 ; CHECK-LABEL: test_select_cc_f32_f16(
 ; CHECK-DAG:  ld.param.f32    [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
 ; CHECK-DAG:  ld.param.f32    [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
-; CHECK-DAG:  ld.param.b16    [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2];
-; CHECK-DAG:  ld.param.b16    [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG:  ld.param.b16    [[D:%rs[0-9]+]], [test_select_cc_f32_f16_param_3];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-F16-FTZ:  setp.neu.ftz.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
@@ -355,13 +355,13 @@ define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
 }
 
 ; CHECK-LABEL: test_select_cc_f16_f32(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_select_cc_f16_f32_param_0];
 ; CHECK-DAG:  ld.param.f32    [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
 ; CHECK-DAG:  ld.param.f32    [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
 ; CHECK-NOFTZ-DAG:  setp.neu.f32    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-F16-FTZ-DAG:  setp.neu.ftz.f32    [[PRED:%p[0-9]+]], [[C]], [[D]]
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1];
-; CHECK-NEXT: selp.b16        [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT: selp.b16        [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
@@ -371,8 +371,8 @@ define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_une(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_une_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_une_param_1];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.neu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -387,8 +387,8 @@ define i1 @test_fcmp_une(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ueq(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ueq_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ueq_param_1];
 ; CHECK-F16-NOFTZ:  setp.equ.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.equ.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -403,8 +403,8 @@ define i1 @test_fcmp_ueq(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ugt(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ugt_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ugt_param_1];
 ; CHECK-F16-NOFTZ:  setp.gtu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.gtu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -419,8 +419,8 @@ define i1 @test_fcmp_ugt(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_uge(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_uge_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_uge_param_1];
 ; CHECK-F16-NOFTZ:  setp.geu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.geu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -435,8 +435,8 @@ define i1 @test_fcmp_uge(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ult(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ult_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ult_param_1];
 ; CHECK-F16-NOFTZ:  setp.ltu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.ltu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -451,8 +451,8 @@ define i1 @test_fcmp_ult(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ule(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ule_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ule_param_1];
 ; CHECK-F16-NOFTZ:  setp.leu.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.leu.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -468,8 +468,8 @@ define i1 @test_fcmp_ule(half %a, half %b) #0 {
 
 
 ; CHECK-LABEL: test_fcmp_uno(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_uno_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_uno_param_1];
 ; CHECK-F16-NOFTZ:  setp.nan.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.nan.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -484,8 +484,8 @@ define i1 @test_fcmp_uno(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_one(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_one_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_one_param_1];
 ; CHECK-F16-NOFTZ:  setp.ne.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.ne.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -500,8 +500,8 @@ define i1 @test_fcmp_one(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_oeq(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_oeq_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_oeq_param_1];
 ; CHECK-F16-NOFTZ:  setp.eq.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.eq.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -516,8 +516,8 @@ define i1 @test_fcmp_oeq(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ogt(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ogt_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ogt_param_1];
 ; CHECK-F16-NOFTZ:  setp.gt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.gt.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -532,8 +532,8 @@ define i1 @test_fcmp_ogt(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_oge(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_oge_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_oge_param_1];
 ; CHECK-F16-NOFTZ:  setp.ge.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.ge.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -548,8 +548,8 @@ define i1 @test_fcmp_oge(half %a, half %b) #0 {
 }
 
 ; XCHECK-LABEL: test_fcmp_olt(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_olt_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_olt_param_1];
 ; CHECK-F16-NOFTZ:  setp.lt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.lt.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -564,8 +564,8 @@ define i1 @test_fcmp_olt(half %a, half %b) #0 {
 }
 
 ; XCHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ole_param_1];
 ; CHECK-F16-NOFTZ:  setp.le.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.le.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -580,8 +580,8 @@ define i1 @test_fcmp_ole(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fcmp_ord_param_1];
 ; CHECK-F16-NOFTZ:  setp.num.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.num.ftz.f16    [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
@@ -596,8 +596,8 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_br_cc(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_br_cc_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_br_cc_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_br_cc_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_br_cc_param_1];
 ; CHECK-DAG:  ld.param.u64    %[[C:rd[0-9]+]], [test_br_cc_param_2];
 ; CHECK-DAG:  ld.param.u64    %[[D:rd[0-9]+]], [test_br_cc_param_3];
 ; CHECK-F16-NOFTZ:  setp.lt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
@@ -623,10 +623,10 @@ else:
 
 ; CHECK-LABEL: test_phi(
 ; CHECK:      ld.param.u64    %[[P1:rd[0-9]+]], [test_phi_param_0];
-; CHECK:      ld.b16  {{%h[0-9]+}}, [%[[P1]]];
+; CHECK:      ld.b16  {{%rs[0-9]+}}, [%[[P1]]];
 ; CHECK: [[LOOP:\$L__BB[0-9_]+]]:
-; CHECK:      mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
-; CHECK:      ld.b16  [[AB:%h[0-9]+]], [%[[P1]]];
+; CHECK:      mov.u16 [[R:%rs[0-9]+]], [[AB:%rs[0-9]+]];
+; CHECK:      ld.b16  [[AB:%rs[0-9]+]], [%[[P1]]];
 ; CHECK:      {
 ; CHECK:      st.param.b64    [param0+0], %[[P1]];
 ; CHECK:      call.uni (retval0),
@@ -651,7 +651,7 @@ return:
 declare i1 @test_dummy(ptr %p1) #0
 
 ; CHECK-LABEL: test_fptosi_i32(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fptosi_i32_param_0];
 ; CHECK:      cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -661,7 +661,7 @@ define i32 @test_fptosi_i32(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptosi_i64(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fptosi_i64_param_0];
 ; CHECK:      cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
 ; CHECK:      st.param.b64    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -671,7 +671,7 @@ define i64 @test_fptosi_i64(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptoui_i32(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fptoui_i32_param_0];
 ; CHECK:      cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -681,7 +681,7 @@ define i32 @test_fptoui_i32(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptoui_i64(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fptoui_i64_param_0];
 ; CHECK:      cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
 ; CHECK:      st.param.b64    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -692,7 +692,7 @@ define i64 @test_fptoui_i64(half %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_i32(
 ; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
-; CHECK:      cvt.rn.f16.u32  [[R:%h[0-9]+]], [[A]];
+; CHECK:      cvt.rn.f16.u32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_uitofp_i32(i32 %a) #0 {
@@ -702,7 +702,7 @@ define half @test_uitofp_i32(i32 %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_i64(
 ; CHECK:      ld.param.u64    [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
-; CHECK:      cvt.rn.f16.u64  [[R:%h[0-9]+]], [[A]];
+; CHECK:      cvt.rn.f16.u64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_uitofp_i64(i64 %a) #0 {
@@ -712,7 +712,7 @@ define half @test_uitofp_i64(i64 %a) #0 {
 
 ; CHECK-LABEL: test_sitofp_i32(
 ; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
-; CHECK:      cvt.rn.f16.s32  [[R:%h[0-9]+]], [[A]];
+; CHECK:      cvt.rn.f16.s32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_sitofp_i32(i32 %a) #0 {
@@ -722,7 +722,7 @@ define half @test_sitofp_i32(i32 %a) #0 {
 
 ; CHECK-LABEL: test_sitofp_i64(
 ; CHECK:      ld.param.u64    [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
-; CHECK:      cvt.rn.f16.s64  [[R:%h[0-9]+]], [[A]];
+; CHECK:      cvt.rn.f16.s64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_sitofp_i64(i64 %a) #0 {
@@ -732,14 +732,14 @@ define half @test_sitofp_i64(i64 %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_i32_fadd(
 ; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
-; CHECK-DAG:  cvt.rn.f16.u32  [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
-; CHECK-F16-NOFTZ:       add.rn.f16      [[R:%h[0-9]+]], [[B]], [[C]];
-; CHECK-F16-FTZ:       add.rn.ftz.f16      [[R:%h[0-9]+]], [[B]], [[C]];
+; CHECK-DAG:  cvt.rn.f16.u32  [[C:%rs[0-9]+]], [[A]];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_uitofp_i32_fadd_param_1];
+; CHECK-F16-NOFTZ:       add.rn.f16      [[R:%rs[0-9]+]], [[B]], [[C]];
+; CHECK-F16-FTZ:       add.rn.ftz.f16      [[R:%rs[0-9]+]], [[B]], [[C]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
 ; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
@@ -750,14 +750,14 @@ define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
 
 ; CHECK-LABEL: test_sitofp_i32_fadd(
 ; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
-; CHECK-DAG:  cvt.rn.f16.s32  [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
-; CHECK-F16-NOFTZ:         add.rn.f16     [[R:%h[0-9]+]], [[B]], [[C]];
-; CHECK-F16-FTZ:         add.rn.ftz.f16     [[R:%h[0-9]+]], [[B]], [[C]];
+; CHECK-DAG:  cvt.rn.f16.s32  [[C:%rs[0-9]+]], [[A]];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_sitofp_i32_fadd_param_1];
+; CHECK-F16-NOFTZ:         add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[C]];
+; CHECK-F16-FTZ:         add.rn.ftz.f16     [[R:%rs[0-9]+]], [[B]], [[C]];
 ; XCHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; XCHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
 ; XCHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
@@ -768,7 +768,7 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
 
 ; CHECK-LABEL: test_fptrunc_float(
 ; CHECK:      ld.param.f32    [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[A]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_fptrunc_float(float %a) #0 {
@@ -778,7 +778,7 @@ define half @test_fptrunc_float(float %a) #0 {
 
 ; CHECK-LABEL: test_fptrunc_double(
 ; CHECK:      ld.param.f64    [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
-; CHECK:      cvt.rn.f16.f64  [[R:%h[0-9]+]], [[A]];
+; CHECK:      cvt.rn.f16.f64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_fptrunc_double(double %a) #0 {
@@ -787,7 +787,7 @@ define half @test_fptrunc_double(double %a) #0 {
 }
 
 ; CHECK-LABEL: test_fpext_float(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fpext_float_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fpext_float_param_0];
 ; CHECK-NOFTZ:      cvt.f32.f16     [[R:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[R:%f[0-9]+]], [[A]];
 ; CHECK:      st.param.f32    [func_retval0+0], [[R]];
@@ -798,7 +798,7 @@ define float @test_fpext_float(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fpext_double(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fpext_double_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fpext_double_param_0];
 ; CHECK:      cvt.f64.f16     [[R:%fd[0-9]+]], [[A]];
 ; CHECK:      st.param.f64    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -809,9 +809,8 @@ define double @test_fpext_double(half %a) #0 {
 
 
 ; CHECK-LABEL: test_bitcast_halftoi16(
-; CHECK:      ld.param.b16    [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
-; CHECK:      mov.b16         [[AS:%rs[0-9]+]], [[AH]]
-; CHECK:      cvt.u32.u16     [[R:%r[0-9]+]], [[AS]]
+; CHECK:      ld.param.b16    [[AH:%rs[0-9]+]], [test_bitcast_halftoi16_param_0];
+; CHECK:      cvt.u32.u16     [[R:%r[0-9]+]], [[AH]]
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define i16 @test_bitcast_halftoi16(half %a) #0 {
@@ -821,8 +820,7 @@ define i16 @test_bitcast_halftoi16(half %a) #0 {
 
 ; CHECK-LABEL: test_bitcast_i16tohalf(
 ; CHECK:      ld.param.u16    [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
-; CHECK:      mov.b16         [[AH:%h[0-9]+]], [[AS]]
-; CHECK:      st.param.b16    [func_retval0+0], [[AH]];
+; CHECK:      st.param.b16    [func_retval0+0], [[AS]];
 ; CHECK:      ret;
 define half @test_bitcast_i16tohalf(i16 %a) #0 {
   %r = bitcast i16 %a to half
@@ -855,12 +853,12 @@ declare half @llvm.roundeven.f16(half %a) #0
 declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
 
 ; CHECK-LABEL: test_sqrt(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_sqrt_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_sqrt_param_0];
 ; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-NOFTZ:      sqrt.rn.f32     [[RF:%f[0-9]+]], [[AF]];
 ; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ:      sqrt.rn.ftz.f32     [[RF:%f[0-9]+]], [[AF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_sqrt(half %a) #0 {
@@ -876,11 +874,11 @@ define half @test_sqrt(half %a) #0 {
 ;}
 
 ; CHECK-LABEL: test_sin(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_sin_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_sin_param_0];
 ; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK:      sin.approx.f32  [[RF:%f[0-9]+]], [[AF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_sin(half %a) #0 #1 {
@@ -889,11 +887,11 @@ define half @test_sin(half %a) #0 #1 {
 }
 
 ; CHECK-LABEL: test_cos(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_cos_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_cos_param_0];
 ; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK:      cos.approx.f32  [[RF:%f[0-9]+]], [[AF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_cos(half %a) #0 #1 {
@@ -944,16 +942,16 @@ define half @test_cos(half %a) #0 #1 {
 ;}
 
 ; CHECK-LABEL: test_fma(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fma_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fma_param_1];
-; CHECK-DAG:  ld.param.b16    [[C:%h[0-9]+]], [test_fma_param_2];
-; CHECK-F16-NOFTZ:      fma.rn.f16      [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-F16-FTZ:      fma.rn.ftz.f16      [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fma_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fma_param_1];
+; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_fma_param_2];
+; CHECK-F16-NOFTZ:      fma.rn.f16      [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-F16-FTZ:      fma.rn.ftz.f16      [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
 ; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret
 define half @test_fma(half %a, half %b, half %c) #0 {
@@ -962,12 +960,12 @@ define half @test_fma(half %a, half %b, half %c) #0 {
 }
 
 ; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_fabs_param_0];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fabs_param_0];
 ; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-NOFTZ:      abs.f32         [[RF:%f[0-9]+]], [[AF]];
 ; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ:      abs.ftz.f32         [[RF:%f[0-9]+]], [[AF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_fabs(half %a) #0 {
@@ -976,15 +974,15 @@ define half @test_fabs(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_minnum(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_minnum_param_1];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOFTZ:      min.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
 ; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[BF:%f[0-9]+]], [[B]];
 ; CHECK-F16-FTZ:      min.ftz.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_minnum(half %a, half %b) #0 {
@@ -993,15 +991,15 @@ define half @test_minnum(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_maxnum(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_maxnum_param_1];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-NOFTZ-DAG:  cvt.f32.f16     [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOFTZ:      max.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
 ; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ-DAG:  cvt.ftz.f32.f16     [[BF:%f[0-9]+]], [[B]];
 ; CHECK-F16-FTZ:      max.ftz.f32         [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
+; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_maxnum(half %a, half %b) #0 {
@@ -1010,15 +1008,12 @@ define half @test_maxnum(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign(
-; CHECK-DAG:  ld.param.b16    [[AH:%h[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG:  ld.param.b16    [[BH:%h[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG:  mov.b16         [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG:  mov.b16         [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG:  and.b16         [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG:  ld.param.b16    [[BH:%rs[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
+; CHECK-DAG:  and.b16         [[BX:%rs[0-9]+]], [[BH]], -32768;
 ; CHECK:      or.b16          [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK:      mov.b16         [[R:%h[0-9]+]], [[RX]];
-; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK:      st.param.b16    [func_retval0+0], [[RX]];
 ; CHECK:      ret;
 define half @test_copysign(half %a, half %b) #0 {
   %r = call half @llvm.copysign.f16(half %a, half %b)
@@ -1026,16 +1021,14 @@ define half @test_copysign(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG:  ld.param.b16    [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_f32_param_0];
 ; CHECK-DAG:  ld.param.f32    [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
-; CHECK-DAG:  mov.b16         [[A:%rs[0-9]+]], [[AH]];
 ; CHECK-DAG:  mov.b32         [[B:%r[0-9]+]], [[BF]];
-; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b32         [[BX0:%r[0-9]+]], [[B]], -2147483648;
 ; CHECK-DAG:  mov.b32         {tmp, [[BX2:%rs[0-9]+]]}, [[BX0]];
 ; CHECK:      or.b16          [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK:      mov.b16         [[R:%h[0-9]+]], [[RX]];
-; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK:      st.param.b16    [func_retval0+0], [[RX]];
 ; CHECK:      ret;
 define half @test_copysign_f32(half %a, float %b) #0 {
   %tb = fptrunc float %b to half
@@ -1044,17 +1037,15 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG:  ld.param.b16    [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_f64_param_0];
 ; CHECK-DAG:  ld.param.f64    [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
-; CHECK-DAG:  mov.b16         [[A:%rs[0-9]+]], [[AH]];
 ; CHECK-DAG:  mov.b64         [[B:%rd[0-9]+]], [[BD]];
-; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b64         [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
 ; CHECK-DAG:  shr.u64         [[BX1:%rd[0-9]+]], [[BX0]], 48;
 ; CHECK-DAG:  cvt.u16.u64     [[BX2:%rs[0-9]+]], [[BX1]];
 ; CHECK:      or.b16          [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK:      mov.b16         [[R:%h[0-9]+]], [[RX]];
-; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK:      st.param.b16    [func_retval0+0], [[RX]];
 ; CHECK:      ret;
 define half @test_copysign_f64(half %a, double %b) #0 {
   %tb = fptrunc double %b to half
@@ -1063,16 +1054,13 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG:  ld.param.b16    [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG:  ld.param.b16    [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-DAG:  mov.b16         [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG:  mov.b16         [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG:  and.b16         [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG:  ld.param.b16    [[BH:%rs[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
+; CHECK-DAG:  and.b16         [[BX:%rs[0-9]+]], [[BH]], -32768;
 ; CHECK:      or.b16          [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK:      mov.b16         [[R:%h[0-9]+]], [[RX]];
-; CHECK-NOFTZ: cvt.f32.f16     [[XR:%f[0-9]+]], [[R]];
-; CHECK-F16-FTZ:   cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[R]];
+; CHECK-NOFTZ: cvt.f32.f16     [[XR:%f[0-9]+]], [[RX]];
+; CHECK-F16-FTZ:   cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[RX]];
 ; CHECK:      st.param.f32    [func_retval0+0], [[XR]];
 ; CHECK:      ret;
 define float @test_copysign_extended(half %a, half %b) #0 {
@@ -1082,8 +1070,8 @@ define float @test_copysign_extended(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_floor(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_floor_param_0];
-; CHECK:      cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_floor_param_0];
+; CHECK:      cvt.rmi.f16.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_floor(half %a) #0 {
@@ -1092,8 +1080,8 @@ define half @test_floor(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_ceil(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_ceil_param_0];
-; CHECK:      cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_ceil_param_0];
+; CHECK:      cvt.rpi.f16.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_ceil(half %a) #0 {
@@ -1102,8 +1090,8 @@ define half @test_ceil(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_trunc(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_trunc_param_0];
-; CHECK:      cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_trunc_param_0];
+; CHECK:      cvt.rzi.f16.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_trunc(half %a) #0 {
@@ -1112,8 +1100,8 @@ define half @test_trunc(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_rint(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_rint_param_0];
-; CHECK:      cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_rint_param_0];
+; CHECK:      cvt.rni.f16.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_rint(half %a) #0 {
@@ -1122,8 +1110,8 @@ define half @test_rint(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_nearbyint(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_nearbyint_param_0];
-; CHECK:      cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_nearbyint_param_0];
+; CHECK:      cvt.rni.f16.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_nearbyint(half %a) #0 {
@@ -1132,8 +1120,8 @@ define half @test_nearbyint(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_roundeven(
-; CHECK:      ld.param.b16    [[A:%h[0-9]+]], [test_roundeven_param_0];
-; CHECK:      cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_roundeven_param_0];
+; CHECK:      cvt.rni.f16.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_roundeven(half %a) #0 {
@@ -1154,16 +1142,16 @@ define half @test_round(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG:  ld.param.b16    [[C:%h[0-9]+]], [test_fmuladd_param_2];
-; CHECK-F16-NOFTZ:        fma.rn.f16     [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-F16-FTZ:        fma.rn.ftz.f16     [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_fmuladd_param_2];
+; CHECK-F16-NOFTZ:        fma.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-F16-FTZ:        fma.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%f[0-9]+]], [[C]]
 ; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_fmuladd(half %a, half %b, half %c) #0 {

diff  --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 6fd7261f20a1f..d0e2ef4f6a540 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -31,8 +31,7 @@
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: test_ret_const(
-; CHECK:     mov.u32         [[T:%r[0-9+]]], 1073757184;
-; CHECK:     mov.b32         [[R:%hh[0-9+]]], [[T]];
+; CHECK:     mov.b32         [[R:%r[0-9]+]], 1073757184;
 ; CHECK:     st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_ret_const() #0 {
@@ -40,8 +39,8 @@ define <2 x half> @test_ret_const() #0 {
 }
 
 ; CHECK-LABEL: test_extract_0(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_extract_0_param_0];
-; CHECK:      mov.b32         {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_extract_0_param_0];
+; CHECK:      mov.b32         {[[R:%rs[0-9]+]], %tmp_hi}, [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_extract_0(<2 x half> %a) #0 {
@@ -50,8 +49,8 @@ define half @test_extract_0(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_extract_1(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_extract_1_param_0];
-; CHECK:      mov.b32         {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_extract_1_param_0];
+; CHECK:      mov.b32         {%tmp_lo, [[R:%rs[0-9]+]]}, [[A]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_extract_1(<2 x half> %a) #0 {
@@ -60,11 +59,11 @@ define half @test_extract_1(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_extract_i(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_extract_i_param_0];
 ; CHECK-DAG:  ld.param.u64    [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
 ; CHECK-DAG:  setp.eq.s64     [[PRED:%p[0-9]+]], [[IDX]], 0;
-; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
-; CHECK:      selp.b16        [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK-DAG:  mov.b32         {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]];
+; CHECK:      selp.b16        [[R:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
@@ -73,22 +72,22 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
 }
 
 ; CHECK-LABEL: test_fadd(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fadd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fadd_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fadd_param_1];
 ;
-; CHECK-F16-NEXT:   add.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]];
+; CHECK-F16-NEXT:   add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -99,20 +98,19 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
 
 ; Check that we can lower fadd with immediate arguments.
 ; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
 ;
-; CHECK-F16:        mov.u32        [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:        mov.b32        [[IHH:%hh[0-9+]]], [[I]];
-; CHECK-F16:        add.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[IHH]];
+; CHECK-F16:        mov.b32        [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16:        add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[I]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -122,20 +120,19 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fadd_imm_1_param_0];
 ;
-; CHECK-F16:        mov.u32        [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:        mov.b32        [[IHH:%hh[0-9+]]], [[I]];
-; CHECK-F16:        add.rn.f16x2   [[R:%hh[0-9]+]], [[B]], [[IHH]];
+; CHECK-F16:        mov.b32        [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16:        add.rn.f16x2   [[R:%r[0-9]+]], [[B]], [[I]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -145,22 +142,22 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fsub(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fsub_param_0];
 ;
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
 ; CHECK-NOF16-DAG:  sub.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
 ; CHECK-NOF16-DAG:  sub.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -170,21 +167,20 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fneg(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fneg_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fneg_param_0];
 ;
-; CHECK-F16:        mov.u32        [[I0:%r[0-9+]]], 0;
-; CHECK-F16:        mov.b32        [[IHH0:%hh[0-9+]]], [[I0]];
-; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+; CHECK-F16:        mov.b32        [[I:%r[0-9]+]], 0;
+; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[I]], [[A]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
 ; CHECK-NOF16-DAG:  mov.f32        [[Z:%f[0-9]+]], 0f00000000;
 ; CHECK-NOF16-DAG:  sub.rn.f32     [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
 ; CHECK-NOF16-DAG:  sub.rn.f32     [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -194,21 +190,21 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fmul(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NEXT: mul.rn.f16x2     [[R:%hh[0-9]+]], [[A]], [[B]];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2     [[R:%r[0-9]+]], [[A]], [[B]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
 ; CHECK-NOF16-DAG:  mul.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
 ; CHECK-NOF16-DAG:  mul.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -218,19 +214,19 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fdiv(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-DAG:  cvt.f32.f16     [[FA0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[FA1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  cvt.f32.f16     [[FB0:%f[0-9]+]], [[B0]];
 ; CHECK-DAG:  cvt.f32.f16     [[FB1:%f[0-9]+]], [[B1]];
 ; CHECK-DAG:  div.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
 ; CHECK-DAG:  div.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[FR0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
@@ -240,11 +236,11 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
 
 ; CHECK-LABEL: test_frem(
 ; -- Load two 16x2 inputs and split them into f16 elements
-; CHECK-DAG:  ld.param.b32       [[A:%hh[0-9]+]], [test_frem_param_0];
-; CHECK-DAG:  ld.param.b32       [[B:%hh[0-9]+]], [test_frem_param_1];
+; CHECK-DAG:  ld.param.b32       [[A:%r[0-9]+]], [test_frem_param_0];
+; CHECK-DAG:  ld.param.b32       [[B:%r[0-9]+]], [test_frem_param_1];
 ; -- Split into elements
-; CHECK-DAG:  mov.b32            {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32            {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  mov.b32            {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32            {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; -- promote to f32.
 ; CHECK-DAG:  cvt.f32.f16        [[FA0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16        [[FB0:%f[0-9]+]], [[B0]];
@@ -265,10 +261,10 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-DAG:  testp.infinite.f32 [[ISB1INF:%p[0-9]+]], [[FB1]];
 ; CHECK-DAG:  selp.f32           [[RF1:%f[0-9]+]], [[FA1]], [[RFNINF1]], [[ISB1INF]];
 ; -- convert back to f16.
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
 ; -- merge into f16x2 and return it.
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
@@ -279,8 +275,8 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-LABEL: .func test_ldst_v2f16(
 ; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
 ; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
-; CHECK-DAG:    ld.b32          [[E:%hh[0-9]+]], [%[[A]]]
-; CHECK:        mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG:    ld.b32          [[E:%r[0-9]+]], [%[[A]]]
+; CHECK:        mov.b32         {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[E]];
 ; CHECK-DAG:    st.v2.b16       [%[[B]]], {[[E0]], [[E1]]};
 ; CHECK:        ret;
 define void @test_ldst_v2f16(ptr %a, ptr %b) {
@@ -309,7 +305,7 @@ define void @test_ldst_v3f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: .func test_ldst_v4f16(
 ; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
 ; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
-; CHECK-DAG:    ld.v4.b16       {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]];
+; CHECK-DAG:    ld.v4.b16       {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]];
 ; CHECK-DAG:    st.v4.b16       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
 ; CHECK:        ret;
 define void @test_ldst_v4f16(ptr %a, ptr %b) {
@@ -333,8 +329,8 @@ define void @test_ldst_v8f16(ptr %a, ptr %b) {
 declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
 
 ; CHECK-LABEL: test_call(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_call_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_call_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_param_1];
 ; CHECK:      {
 ; CHECK-DAG:  .param .align 4 .b8 param0[4];
 ; CHECK-DAG:  .param .align 4 .b8 param1[4];
@@ -344,7 +340,7 @@ declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT:        test_callee,
 ; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
 ; CHECK-NEXT: }
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -354,8 +350,8 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_call_flipped(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_call_flipped_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_call_flipped_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_flipped_param_1];
 ; CHECK:      {
 ; CHECK-DAG:  .param .align 4 .b8 param0[4];
 ; CHECK-DAG:  .param .align 4 .b8 param1[4];
@@ -365,7 +361,7 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT:        test_callee,
 ; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
 ; CHECK-NEXT: }
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -375,8 +371,8 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_tailcall_flipped(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_tailcall_flipped_param_1];
 ; CHECK:      {
 ; CHECK-DAG:  .param .align 4 .b8 param0[4];
 ; CHECK-DAG:  .param .align 4 .b8 param1[4];
@@ -386,7 +382,7 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT:        test_callee,
 ; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
 ; CHECK-NEXT: }
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -396,11 +392,11 @@ define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_select(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_select_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_select_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_param_1];
 ; CHECK-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
 ; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b32        [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
@@ -409,15 +405,15 @@ define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
 }
 
 ; CHECK-LABEL: test_select_cc(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%hh[0-9]+]], [test_select_cc_param_3];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_param_3];
 ;
 ; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
 ;
-; CHECK-NOF16-DAG: mov.b32        {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: mov.b32        {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32        {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
@@ -425,11 +421,11 @@ define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
 ; CHECK-NOF16-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
 ; CHECK-NOF16-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
 ;
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
@@ -441,12 +437,12 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; CHECK-LABEL: test_select_cc_f32_f16(
 ; CHECK-DAG:  ld.param.v2.f32    {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
 ; CHECK-DAG:  ld.param.v2.f32    {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_f32_f16_param_3];
 ;
 ; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: mov.b32         {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: mov.b32         {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: mov.b32         {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32         {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
@@ -466,17 +462,17 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 }
 
 ; CHECK-LABEL: test_select_cc_f16_f32(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_f16_f32_param_1];
 ; CHECK-DAG:  ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
 ; CHECK-DAG:  ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
 ; CHECK-DAG:  setp.neu.f32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
 ; CHECK-DAG:  setp.neu.f32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
@@ -487,11 +483,11 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
 }
 
 ; CHECK-LABEL: test_fcmp_une(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_une_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_une_param_1];
 ; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -509,11 +505,11 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ueq(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ueq_param_1];
 ; CHECK-F16:  setp.equ.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -531,11 +527,11 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ugt(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ugt_param_1];
 ; CHECK-F16:  setp.gtu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -553,11 +549,11 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_uge(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_uge_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_uge_param_1];
 ; CHECK-F16:  setp.geu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -575,11 +571,11 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ult(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ult_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ult_param_1];
 ; CHECK-F16:  setp.ltu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -597,11 +593,11 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ule(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ule_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ule_param_1];
 ; CHECK-F16:  setp.leu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -620,11 +616,11 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
 
 
 ; CHECK-LABEL: test_fcmp_uno(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_uno_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_uno_param_1];
 ; CHECK-F16:  setp.nan.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -642,11 +638,11 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_one(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_one_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_one_param_1];
 ; CHECK-F16:  setp.ne.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -664,11 +660,11 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_oeq(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_oeq_param_1];
 ; CHECK-F16:  setp.eq.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -686,11 +682,11 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ogt(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ogt_param_1];
 ; CHECK-F16:  setp.gt.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -708,11 +704,11 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_oge(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_oge_param_1];
 ; CHECK-F16:  setp.ge.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -730,11 +726,11 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_olt(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_olt_param_1];
 ; CHECK-F16:  setp.lt.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -752,11 +748,11 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; XCHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ole_param_1];
 ; CHECK-F16:  setp.le.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -774,11 +770,11 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ord_param_1];
 ; CHECK-F16:  setp.num.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
@@ -796,8 +792,8 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_fptosi_i32(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptosi_i32_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
 ; CHECK:      st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
@@ -808,8 +804,8 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptosi_i64(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptosi_i64_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
 ; CHECK:      st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
@@ -820,8 +816,8 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptoui_2xi32(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptoui_2xi32_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
 ; CHECK:      st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
@@ -832,8 +828,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptoui_2xi64(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptoui_2xi64_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
 ; CHECK:      st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
@@ -845,9 +841,9 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_2xi32(
 ; CHECK:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
-; CHECK-DAG:  cvt.rn.f16.u32  [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.u32  [[R1:%h[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.u32  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.u32  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
@@ -857,9 +853,9 @@ define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_2xi64(
 ; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
-; CHECK-DAG:  cvt.rn.f16.u64  [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.u64  [[R1:%h[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.u64  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.u64  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
@@ -869,9 +865,9 @@ define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
 
 ; CHECK-LABEL: test_sitofp_2xi32(
 ; CHECK:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
-; CHECK-DAG:  cvt.rn.f16.s32  [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.s32  [[R1:%h[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.s32  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.s32  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
@@ -881,9 +877,9 @@ define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
 
 ; CHECK-LABEL: test_sitofp_2xi64(
 ; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
-; CHECK-DAG:  cvt.rn.f16.s64  [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.s64  [[R1:%h[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.s64  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.s64  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
@@ -893,23 +889,23 @@ define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_2xi32_fadd(
 ; CHECK-DAG:  ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
-; CHECK-DAG:  cvt.rn.f16.u32  [[C0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.u32  [[C1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
+; CHECK-DAG:  cvt.rn.f16.u32  [[C0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.u32  [[C1:%rs[0-9]+]], [[A1]];
 
-; CHECK-F16-DAG:  mov.b32         [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
-; CHECK-F16-DAG:  add.rn.f16x2    [[R:%hh[0-9]+]], [[B]], [[C]];
+; CHECK-F16-DAG:  mov.b32         [[C:%r[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG:  add.rn.f16x2    [[R:%r[0-9]+]], [[B]], [[C]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -921,23 +917,23 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 
 ; CHECK-LABEL: test_sitofp_2xi32_fadd(
 ; CHECK-DAG:  ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
-; CHECK-DAG:  cvt.rn.f16.s32  [[C0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.s32  [[C1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
+; CHECK-DAG:  cvt.rn.f16.s32  [[C0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.s32  [[C1:%rs[0-9]+]], [[A1]];
 ;
-; CHECK-F16-DAG:  mov.b32         [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
-; CHECK-F16-DAG:  add.rn.f16x2    [[R:%hh[0-9]+]], [[B]], [[C]];
+; CHECK-F16-DAG:  mov.b32         [[C:%r[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG:  add.rn.f16x2    [[R:%r[0-9]+]], [[B]], [[C]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
 ; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -949,9 +945,9 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 
 ; CHECK-LABEL: test_fptrunc_2xfloat(
 ; CHECK:      ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
@@ -961,9 +957,9 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 
 ; CHECK-LABEL: test_fptrunc_2xdouble(
 ; CHECK:      ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
-; CHECK-DAG:  cvt.rn.f16.f64  [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.f64  [[R1:%h[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f64  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.f64  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
@@ -972,8 +968,8 @@ define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fpext_2xfloat(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.f32.f16     [[R0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[R1:%f[0-9]+]], [[A1]];
 ; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
@@ -984,8 +980,8 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fpext_2xdouble(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xdouble_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.f64.f16     [[R0:%fd[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f64.f16     [[R1:%fd[0-9]+]], [[A1]];
 ; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
@@ -1012,8 +1008,7 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
 ; CHECK-DAG:  cvt.u32.u16     [[R0:%r[0-9]+]], [[RS0]];
 ; CHECK-DAG:  cvt.u32.u16     [[R1:%r[0-9]+]], [[RS1]];
 ; CHECK-DAG:  shl.b32         [[R1H:%r[0-9]+]], [[R1]], 16;
-; CHECK-DAG:  or.b32          [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], [[R1H0L]];
+; CHECK-DAG:  or.b32          [[R:%r[0-9]+]], [[R0]], [[R1H]];
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
@@ -1023,7 +1018,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 
 ; CHECK-LABEL: test_bitcast_float_to_2xhalf(
 ; CHECK: ld.param.f32 	[[AF1:%f[0-9]+]], [test_bitcast_float_to_2xhalf_param_0];
-; CHECK: mov.b32 	[[R:%hh[0-9]+]], [[AF1]];
+; CHECK: mov.b32 	[[R:%r[0-9]+]], [[AF1]];
 ; CHECK: st.param.b32 	[func_retval0+0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
@@ -1066,15 +1061,15 @@ declare <2 x half> @llvm.roundeven.f16(<2 x half> %a) #0
 declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
 
 ; CHECK-LABEL: test_sqrt(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_sqrt_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sqrt_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  sqrt.rn.f32     [[RF0:%f[0-9]+]], [[AF0]];
 ; CHECK-DAG:  sqrt.rn.f32     [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_sqrt(<2 x half> %a) #0 {
@@ -1090,15 +1085,15 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 ;}
 
 ; CHECK-LABEL: test_sin(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_sin_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sin_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  sin.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
 ; CHECK-DAG:  sin.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
@@ -1107,15 +1102,15 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
 }
 
 ; CHECK-LABEL: test_cos(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_cos_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_cos_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  cos.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
 ; CHECK-DAG:  cos.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
@@ -1166,15 +1161,15 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ;}
 
 ; CHECK-LABEL: test_fma(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fma_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fma_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_fma_param_2];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fma_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fma_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fma_param_2];
 ;
-; CHECK-F16:        fma.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-F16:        fma.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]], [[C]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
@@ -1183,9 +1178,9 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
 ; CHECK-NOF16-DAG:  fma.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
 ; CHECK-NOF16-DAG:  fma.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret
@@ -1195,15 +1190,15 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 }
 
 ; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fabs_param_0];
-; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  abs.f32         [[RF0:%f[0-9]+]], [[AF0]];
 ; CHECK-DAG:  abs.f32         [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_fabs(<2 x half> %a) #0 {
@@ -1212,19 +1207,19 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_minnum(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  cvt.f32.f16     [[BF0:%f[0-9]+]], [[B0]];
 ; CHECK-DAG:  cvt.f32.f16     [[BF1:%f[0-9]+]], [[B1]];
 ; CHECK-DAG:  min.f32         [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
 ; CHECK-DAG:  min.f32         [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
@@ -1233,19 +1228,19 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_maxnum(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
 ; CHECK-DAG:  cvt.f32.f16     [[BF0:%f[0-9]+]], [[B0]];
 ; CHECK-DAG:  cvt.f32.f16     [[BF1:%f[0-9]+]], [[B1]];
 ; CHECK-DAG:  max.f32         [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
 ; CHECK-DAG:  max.f32         [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
@@ -1254,23 +1249,17 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  mov.b16         [[BS0:%rs[0-9]+]], [[B0]];
-; CHECK-DAG:  mov.b16         [[BS1:%rs[0-9]+]], [[B1]];
-; CHECK-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[AS1]], 32767;
-; CHECK-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[BS0]], -32768;
-; CHECK-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[BS1]], -32768;
-; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
-; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
-; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[A0]], 32767;
+; CHECK-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[A1]], 32767;
+; CHECK-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[B0]], -32768;
+; CHECK-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[B1]], -32768;
+; CHECK-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
@@ -1279,24 +1268,20 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_f32_param_0];
 ; CHECK-DAG:  ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  mov.b32         [[BI0:%r[0-9]+]], [[B0]];
 ; CHECK-DAG:  mov.b32         [[BI1:%r[0-9]+]], [[B1]];
-; CHECK-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[A0]], 32767;
+; CHECK-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[A1]], 32767;
 ; CHECK-DAG:  and.b32         [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
 ; CHECK-DAG:  and.b32         [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
 ; CHECK-DAG:  mov.b32         {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; }
 ; CHECK-DAG:  mov.b32         {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; }
-; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
-; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
-; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
@@ -1306,26 +1291,22 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_f64_param_0];
 ; CHECK-DAG:  ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG:  mov.b64         [[BI0:%rd[0-9]+]], [[B0]];
 ; CHECK-DAG:  mov.b64         [[BI1:%rd[0-9]+]], [[B1]];
-; CHECK-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[A0]], 32767;
+; CHECK-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[A1]], 32767;
 ; CHECK-DAG:  and.b64         [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
 ; CHECK-DAG:  and.b64         [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
 ; CHECK-DAG:  shr.u64         [[BY0:%rd[0-9]+]], [[BX0]], 48;
 ; CHECK-DAG:  shr.u64         [[BY1:%rd[0-9]+]], [[BX1]], 48;
 ; CHECK-DAG:  cvt.u16.u64     [[BZ0:%rs[0-9]+]], [[BY0]];
 ; CHECK-DAG:  cvt.u16.u64     [[BZ1:%rs[0-9]+]], [[BY1]];
-; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
-; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
-; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
@@ -1335,24 +1316,18 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
 }
 
 ; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  mov.b16         [[BS0:%rs[0-9]+]], [[B0]];
-; CHECK-DAG:  mov.b16         [[BS1:%rs[0-9]+]], [[B1]];
-; CHECK-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[AS1]], 32767;
-; CHECK-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[BS0]], -32768;
-; CHECK-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[BS1]], -32768;
-; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
-; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
-; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      mov.b32         {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]]
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[A0]], 32767;
+; CHECK-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[A1]], 32767;
+; CHECK-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[B0]], -32768;
+; CHECK-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[B1]], -32768;
+; CHECK-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      mov.b32         {[[RX0:%rs[0-9]+]], [[RX1:%rs[0-9]+]]}, [[R]]
 ; CHECK-DAG:  cvt.f32.f16     [[XR0:%f[0-9]+]], [[RX0]];
 ; CHECK-DAG:  cvt.f32.f16     [[XR1:%f[0-9]+]], [[RX1]];
 ; CHECK:      st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]};
@@ -1364,11 +1339,11 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 }
 
 ; CHECK-LABEL: test_floor(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_floor_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_floor_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rmi.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rmi.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_floor(<2 x half> %a) #0 {
@@ -1377,11 +1352,11 @@ define <2 x half> @test_floor(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_ceil(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_ceil_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_ceil_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rpi.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rpi.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_ceil(<2 x half> %a) #0 {
@@ -1390,11 +1365,11 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_trunc(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_trunc_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_trunc_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rzi.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rzi.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_trunc(<2 x half> %a) #0 {
@@ -1403,11 +1378,11 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_rint(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_rint_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_rint_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_rint(<2 x half> %a) #0 {
@@ -1416,11 +1391,11 @@ define <2 x half> @test_rint(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_nearbyint(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_nearbyint_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_nearbyint_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
@@ -1429,11 +1404,11 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_roundeven(
-; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_roundeven_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_roundeven_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
 define <2 x half> @test_roundeven(<2 x half> %a) #0 {
@@ -1456,15 +1431,15 @@ define <2 x half> @test_round(<2 x half> %a) #0 {
 }
 
 ; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_fmuladd_param_2];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fmuladd_param_2];
 ;
-; CHECK-F16:        fma.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-F16:        fma.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]], [[C]];
 ;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
@@ -1473,9 +1448,9 @@ define <2 x half> @test_round(<2 x half> %a) #0 {
 ; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
 ; CHECK-NOF16-DAG:  fma.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
 ; CHECK-NOF16-DAG:  fma.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -1485,16 +1460,16 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
 }
 
 ; CHECK-LABEL: test_shufflevector(
-; CHECK: mov.b32 {%h1, %h2}, %hh1;
-; CHECK: mov.b32 %hh2, {%h2, %h1};
+; CHECK: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK: mov.b32 %r2, {%rs2, %rs1};
 define <2 x half> @test_shufflevector(<2 x half> %a) #0 {
   %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   ret <2 x half> %s
 }
 
 ; CHECK-LABEL: test_insertelement(
-; CHECK: mov.b32 {%h2, %tmp_hi}, %hh1;
-; CHECK: mov.b32 %hh2, {%h2, %h1};
+; CHECK: mov.b32 {%rs2, %tmp_hi}, %r1;
+; CHECK: mov.b32 %r2, {%rs2, %rs1};
 define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 {
   %i = insertelement <2 x half> %a, half %x, i64 1
   ret <2 x half> %i

diff  --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll
index d9a3b95ef341e..92acc43914b1e 100644
--- a/llvm/test/CodeGen/NVPTX/half.ll
+++ b/llvm/test/CodeGen/NVPTX/half.ll
@@ -7,7 +7,7 @@
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_load_store
-; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
   %val = load half, ptr addrspace(1) %in
   store half %val, ptr addrspace(1) %out
@@ -16,7 +16,7 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 
 define void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_bitcast_from_half
-; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
   %val = load half, ptr addrspace(1) %in
   %val_int = bitcast half %val to i16

diff  --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
index ff2f60c04d370..a157616db9fb4 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
 
 ; CHECK-LABEL: test_isspacep
 define i1 @test_isspacep_shared_cluster(ptr %p) {

diff  --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
index 67fabbac1d4e9..e6681fe744845 100644
--- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
+++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
@@ -28,8 +28,8 @@
     "i16": "r",
     "i32": "r",
     "i64": "rd",
-    "half": "h",
-    "<2 x half>": "hh",
+    "half": "rs",
+    "<2 x half>": "r",
     "float": "f",
     "double": "fd",
 }

diff  --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index ac33e3e1dc443..5d0db6f80a83d 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -15,7 +15,7 @@ define i32 @ld_global(ptr addrspace(1) %ptr) {
 define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
 ; Load of v2f16 is weird. We consider it to be a legal type, which happens to be
 ; loaded/stored as a 32-bit scalar.
-; CHECK: ld.global.nc.b32
+; CHECK: ld.global.nc.u32
   %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <2 x half> %a, i32 0
   %v2 = extractelement <2 x half> %a, i32 1
@@ -28,7 +28,7 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
 ; Larger f16 vectors may be split into individual f16 elements and multiple
 ; loads/stores may be vectorized using f16 element type. Practically it's
 ; limited to v4 variant only.
-; CHECK: ld.global.nc.v4.b16
+; CHECK: ld.global.nc.v4.u16
   %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <4 x half> %a, i32 0
   %v2 = extractelement <4 x half> %a, i32 1
@@ -44,7 +44,7 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
 define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
 ; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of loads/stores,
 ; so load/store vectorizer has to convert v8f16 -> v4 x v2f16.
-; CHECK: ld.global.nc.v4.b32
+; CHECK: ld.global.nc.v4.u32
   %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <8 x half> %a, i32 0
   %v2 = extractelement <8 x half> %a, i32 2

diff  --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index c152f835afe07..66f0954c34c83 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_32 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare ptr @llvm.nvvm.ldu.global.p.p1i8(ptr addrspace(1) %ptr, i32 %align)
 declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
 declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
 declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -15,6 +16,7 @@ declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare ptr @llvm.nvvm.ldg.global.p.p1i8(ptr addrspace(1) %ptr, i32 %align)
 declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
 declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
 declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -48,6 +50,14 @@ define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
   ret i64 %val
 }
 
+; CHECK-LABEL: test_ldu_p
+define ptr @test_ldu_p(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u64
+  %val = tail call ptr @llvm.nvvm.ldu.global.p.p1i8(ptr addrspace(1) %ptr, i32 8)
+  ret ptr %val
+}
+
+
 ; CHECK-LABEL: test_ldu_f32
 define float @test_ldu_f32(ptr addrspace(1) %ptr) {
   ; CHECK: ldu.global.f32
@@ -64,14 +74,14 @@ define double @test_ldu_f64(ptr addrspace(1) %ptr) {
 
 ; CHECK-LABEL: test_ldu_f16
 define half @test_ldu_f16(ptr addrspace(1) %ptr) {
-  ; CHECK: ldu.global.b16
+  ; CHECK: ldu.global.u16
   %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
   ret half %val
 }
 
 ; CHECK-LABEL: test_ldu_v2f16
 define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
-  ; CHECK: ldu.global.b32
+  ; CHECK: ldu.global.u32
   %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
   ret <2 x half> %val
 }
@@ -104,6 +114,13 @@ define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
   ret i64 %val
 }
 
+; CHECK-LABEL: test_ldg_p
+define ptr @test_ldg_p(ptr addrspace(1) %ptr) {
+  ; CHECK: ld.global.nc.u64
+  %val = tail call ptr @llvm.nvvm.ldg.global.p.p1i8(ptr addrspace(1) %ptr, i32 8)
+  ret ptr %val
+}
+
 ; CHECK-LABEL: test_ldg_f32
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
   ; CHECK: ld.global.nc.f32
@@ -120,14 +137,14 @@ define double @test_ldg_f64(ptr addrspace(1) %ptr) {
 
 ; CHECK-LABEL: test_ldg_f16
 define half @test_ldg_f16(ptr addrspace(1) %ptr) {
-  ; CHECK: ld.global.nc.b16
+  ; CHECK: ld.global.nc.u16
   %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
   ret half %val
 }
 
 ; CHECK-LABEL: test_ldg_v2f16
 define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
-  ; CHECK: ld.global.nc.b32
+  ; CHECK: ld.global.nc.u32
   %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
   ret <2 x half> %val
 }

diff  --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index f2ff7e3fd7afb..b05fbaea17087 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -384,13 +384,13 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_f16(
 ; CHECK-NEXT: .param .b32 test_f16_param_0
-; CHECK:      ld.param.b16    [[E:%h[0-9]+]], [test_f16_param_0];
+; CHECK:      ld.param.b16    [[E:%rs[0-9]+]], [test_f16_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b16    [param0+0], [[E]];
 ; CHECK:      .param .b32 retval0;
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT: test_f16,
-; CHECK:      ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK:      ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]]
 ; CHECK-NEXT: ret;
 define half @test_f16(half %a) {
@@ -401,13 +401,13 @@ define half @test_f16(half %a) {
 ; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
 ; CHECK-LABEL: test_v2f16(
 ; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
-; CHECK:      ld.param.b32    [[E:%hh[0-9]+]], [test_v2f16_param_0];
+; CHECK:      ld.param.b32    [[E:%r[0-9]+]], [test_v2f16_param_0];
 ; CHECK:      .param .align 4 .b8 param0[4];
 ; CHECK:      st.param.b32    [param0+0], [[E]];
 ; CHECK:      .param .align 4 .b8 retval0[4];
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT: test_v2f16,
-; CHECK:      ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]]
 ; CHECK-NEXT: ret;
 define <2 x half> @test_v2f16(<2 x half> %a) {
@@ -418,17 +418,17 @@ define <2 x half> @test_v2f16(<2 x half> %a) {
 ; CHECK:.func  (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_v3f16(
 ; CHECK:      .param .align 8 .b8 test_v3f16_param_0[8]
-; CHECK-DAG:  ld.param.b32    [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
-; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
-; CHECK-DAG:  ld.param.b16    [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
+; CHECK-DAG:  ld.param.b32    [[HH01:%r[0-9]+]], [test_v3f16_param_0];
+; CHECK-DAG:  mov.b32         {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[HH01]];
+; CHECK-DAG:  ld.param.b16    [[E2:%rs[0-9]+]], [test_v3f16_param_0+4];
 ; CHECK:      .param .align 8 .b8 param0[8];
 ; CHECK-DAG:  st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
 ; CHECK-DAG:  st.param.b16    [param0+4], [[E2]];
 ; CHECK:      .param .align 8 .b8 retval0[8];
 ; CHECK:      call.uni (retval0),
 ; CHECK:      test_v3f16,
-; CHECK-DAG:  ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
-; CHECK-DAG:  ld.param.b16    [[R2:%h[0-9]+]], [retval0+4];
+; CHECK-DAG:  ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b16    [[R2:%rs[0-9]+]], [retval0+4];
 ; CHECK-DAG:  st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
 ; CHECK-DAG:  st.param.b16    [func_retval0+4], [[R2]];
 ; CHECK:      ret;
@@ -441,14 +441,12 @@ define <3 x half> @test_v3f16(<3 x half> %a) {
 ; CHECK-LABEL: test_v4f16(
 ; CHECK:      .param .align 8 .b8 test_v4f16_param_0[8]
 ; CHECK:      ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
-; CHECK-DAG:  mov.b32         [[HH01:%hh[0-9]+]], [[R01]];
-; CHECK-DAG:  mov.b32         [[HH23:%hh[0-9]+]], [[R23]];
 ; CHECK:      .param .align 8 .b8 param0[8];
-; CHECK:      st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK:      st.param.v2.b32 [param0+0], {[[R01]], [[R23]]};
 ; CHECK:      .param .align 8 .b8 retval0[8];
 ; CHECK:      call.uni (retval0),
 ; CHECK:      test_v4f16,
-; CHECK:      ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK:      ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0+0];
 ; CHECK:      st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
 ; CHECK:      ret;
 define <4 x half> @test_v4f16(<4 x half> %a) {
@@ -459,17 +457,17 @@ define <4 x half> @test_v4f16(<4 x half> %a) {
 ; CHECK:.func  (.param .align 16 .b8 func_retval0[16])
 ; CHECK-LABEL: test_v5f16(
 ; CHECK:      .param .align 16 .b8 test_v5f16_param_0[16]
-; CHECK-DAG:  ld.param.v4.b16  {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
-; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
-; CHECK-DAG:  ld.param.b16    [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK-DAG:  ld.param.v4.b16  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG:  mov.b32         {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[HH01]];
+; CHECK-DAG:  ld.param.b16    [[E4:%rs[0-9]+]], [test_v5f16_param_0+8];
 ; CHECK:      .param .align 16 .b8 param0[16];
 ; CHECK-DAG:  st.param.v4.b16 [param0+0],
 ; CHECK-DAG:  st.param.b16    [param0+8], [[E4]];
 ; CHECK:      .param .align 16 .b8 retval0[16];
 ; CHECK:      call.uni (retval0),
 ; CHECK:      test_v5f16,
-; CHECK-DAG:  ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
-; CHECK-DAG:  ld.param.b16    [[R4:%h[0-9]+]], [retval0+8];
+; CHECK-DAG:  ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b16    [[R4:%rs[0-9]+]], [retval0+8];
 ; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
 ; CHECK-DAG:  st.param.b16    [func_retval0+8], [[R4]];
 ; CHECK:      ret;
@@ -482,16 +480,12 @@ define <5 x half> @test_v5f16(<5 x half> %a) {
 ; CHECK-LABEL: test_v8f16(
 ; CHECK:      .param .align 16 .b8 test_v8f16_param_0[16]
 ; CHECK:      ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
-; CHECK-DAG:  mov.b32         [[HH01:%hh[0-9]+]], [[R01]];
-; CHECK-DAG:  mov.b32         [[HH23:%hh[0-9]+]], [[R23]];
-; CHECK-DAG:  mov.b32         [[HH45:%hh[0-9]+]], [[R45]];
-; CHECK-DAG:  mov.b32         [[HH67:%hh[0-9]+]], [[R67]];
 ; CHECK:      .param .align 16 .b8 param0[16];
-; CHECK:      st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
+; CHECK:      st.param.v4.b32 [param0+0], {[[R01]], [[R23]], [[R45]], [[R67]]};
 ; CHECK:      .param .align 16 .b8 retval0[16];
 ; CHECK:      call.uni (retval0),
 ; CHECK:      test_v8f16,
-; CHECK:      ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
+; CHECK:      ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0+0];
 ; CHECK:      st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
 ; CHECK:      ret;
 define <8 x half> @test_v8f16(<8 x half> %a) {
@@ -502,9 +496,9 @@ define <8 x half> @test_v8f16(<8 x half> %a) {
 ; CHECK:.func  (.param .align 32 .b8 func_retval0[32])
 ; CHECK-LABEL: test_v9f16(
 ; CHECK:      .param .align 32 .b8 test_v9f16_param_0[32]
-; CHECK-DAG:  ld.param.v4.b16  {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
-; CHECK-DAG:  ld.param.v4.b16  {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
-; CHECK-DAG:  ld.param.b16     [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
+; CHECK-DAG:  ld.param.v4.b16  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG:  ld.param.v4.b16  {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG:  ld.param.b16     [[E8:%rs[0-9]+]], [test_v9f16_param_0+16];
 ; CHECK:      .param .align 32 .b8 param0[32];
 ; CHECK-DAG:  st.param.v4.b16 [param0+0],
 ; CHECK-DAG:  st.param.v4.b16 [param0+8],
@@ -512,9 +506,9 @@ define <8 x half> @test_v8f16(<8 x half> %a) {
 ; CHECK:      .param .align 32 .b8 retval0[32];
 ; CHECK:      call.uni (retval0),
 ; CHECK:      test_v9f16,
-; CHECK-DAG:  ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
-; CHECK-DAG:  ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
-; CHECK-DAG:  ld.param.b16    [[R8:%h[0-9]+]], [retval0+16];
+; CHECK-DAG:  ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8];
+; CHECK-DAG:  ld.param.b16    [[R8:%rs[0-9]+]], [retval0+16];
 ; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
 ; CHECK-DAG:  st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
 ; CHECK-DAG:  st.param.b16    [func_retval0+16], [[R8]];
@@ -915,13 +909,13 @@ define %s_i16 @test_s_i16(%s_i16 %a) {
 ; CHECK: .func  (.param .align 2 .b8 func_retval0[2])
 ; CHECK-LABEL: test_s_f16(
 ; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
-; CHECK:      ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
+; CHECK:      ld.param.b16 [[A:%rs[0-9]+]], [test_s_f16_param_0];
 ; CHECK:      .param .align 2 .b8 param0[2];
 ; CHECK:      st.param.b16    [param0+0], [[A]]
 ; CHECK:      .param .align 2 .b8 retval0[2];
 ; CHECK:      call.uni
 ; CHECK-NEXT: test_s_f16,
-; CHECK:      ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK:      ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
 ; CHECK:      st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define %s_f16 @test_s_f16(%s_f16 %a) {

diff  --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
index 7a3b29b20c4e9..fa138f3d0936e 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
@@ -92,10 +92,10 @@ declare half @callee_f16()
 define  half @check_f16() {
   ; PTX-LABEL: check_f16
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.b16 [[LD:%h[0-9]+]], [retval0+0];
+  ; PTX-DAG: ld.param.b16 [[LD:%rs[0-9]+]], [retval0+0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
-  ; PTX-WITHOUT-DAG: mov.b16 [[PROXY:%h[0-9]+]], [[LD]];
+  ; PTX-WITHOUT-DAG: mov.b16 [[PROXY:%rs[0-9]+]], [[LD]];
   ; PTX-WITHOUT-DAG: st.param.b16 [func_retval0+0], [[PROXY]];
   ; PTX-WITH-DAG:    st.param.b16 [func_retval0+0], [[LD]];
 
@@ -155,10 +155,10 @@ declare <2 x half> @callee_vec_f16()
 define  <2 x half> @check_vec_f16() {
   ; PTX-LABEL: check_vec_f16
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.b32 [[LD:%hh[0-9]+]], [retval0+0];
+  ; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0+0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
-  ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%hh[0-9]+]], [[LD]];
+  ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%r[0-9]+]], [[LD]];
   ; PTX-WITHOUT-DAG: st.param.b32 [func_retval0+0], [[PROXY]];
   ; PTX-WITH-DAG:    st.param.b32 [func_retval0+0], [[LD]];
 

diff  --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 928abe8795a7b..b7153d684671f 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -118,7 +118,7 @@ def __init__(self, ptx_type):
         }[ptx_type]
 
         self.ptx_reg_pattern = {
-            "f16": "%hh[0-9]+",
+            "f16": "%r[0-9]+",
             "f32": "%f[0-9]+",
             "f64": "%fd[0-9]+",
         }.get(ptx_type, "%r[0-9]+")


        


More information about the llvm-commits mailing list